#Importing Important Libraries
#Basic EDA Library
import pandas as pd
import numpy as np
#Plotting Library
import matplotlib.pyplot as plt
import seaborn as sns
#Preprocessing Library
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
#Machine Learning Model
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.feature_selection import VarianceThreshold
#Additional Library
from datetime import datetime
# Remove warnings
from warnings import filterwarnings
filterwarnings('ignore')
pd.set_option('display.max_columns', None)
#Loading the dataset
df2 = pd.read_csv('2A.tsv', sep = '\t')
df2.head()
| Customer ID | Birth Date | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1969/03/05 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 2005/07/01 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
| 2 | 3 | 1974/12/01 | Male | Jeans | Clothing | 73 | Massachusetts | S | Maroon | Spring | 3.1 | Yes | Cash | Free Shipping | Yes | Yes | 23 | Credit Card | Weekly |
| 3 | 4 | 2003/11/21 | Male | Sandals | Footwear | 90 | Rhode Island | M | Maroon | Spring | 3.5 | Yes | PayPal | Next Day Air | Yes | Yes | 49 | PayPal | Weekly |
| 4 | 5 | 1979/09/13 | Male | Blouse | Clothing | 49 | Oregon | M | Turquoise | Spring | 2.7 | Yes | Cash | Free Shipping | Yes | Yes | 31 | PayPal | Annually |
#Checking the dataset summary
df2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3900 entries, 0 to 3899 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 3900 non-null int64 1 Birth Date 3900 non-null object 2 Gender 3900 non-null object 3 Item Purchased 3900 non-null object 4 Category 3900 non-null object 5 Purchase Amount (USD) 3900 non-null int64 6 Location 3900 non-null object 7 Size 3900 non-null object 8 Color 3900 non-null object 9 Season 3900 non-null object 10 Review Rating 3900 non-null float64 11 Subscription Status 3900 non-null object 12 Payment Method 3900 non-null object 13 Shipping Type 3900 non-null object 14 Discount Applied 3900 non-null object 15 Promo Code Used 3900 non-null object 16 Previous Purchases 3900 non-null int64 17 Preferred Payment Method 3900 non-null object 18 Frequency of Purchases 3900 non-null object dtypes: float64(1), int64(3), object(15) memory usage: 579.0+ KB
df2.describe()
| Customer ID | Purchase Amount (USD) | Review Rating | Previous Purchases | |
|---|---|---|---|---|
| count | 3900.000000 | 3900.000000 | 3900.000000 | 3900.000000 |
| mean | 1950.500000 | 59.764359 | 3.749949 | 25.351538 |
| std | 1125.977353 | 23.685392 | 0.716223 | 14.447125 |
| min | 1.000000 | 20.000000 | 2.500000 | 1.000000 |
| 25% | 975.750000 | 39.000000 | 3.100000 | 13.000000 |
| 50% | 1950.500000 | 60.000000 | 3.700000 | 25.000000 |
| 75% | 2925.250000 | 81.000000 | 4.400000 | 38.000000 |
| max | 3900.000000 | 100.000000 | 5.000000 | 50.000000 |
#Checking the unique value in each categorical column
df2_cat = df2.select_dtypes(include=['object']).columns
df2_cat = df2_cat.tolist()
df2_cat
for i in df2_cat:
print(df2[i].value_counts())
print()
Birth Date
1970/12/28 3
1965/08/28 3
1966/12/27 3
1970/08/27 3
1969/05/14 3
..
1985/06/23 1
1996/09/02 1
1984/01/21 1
1972/03/05 1
1980/03/30 1
Name: count, Length: 3534, dtype: int64
Gender
Male 2642
Female 1231
f 17
M 10
Name: count, dtype: int64
Item Purchased
Blouse 171
Jewelry 171
Pants 171
Shirt 169
Dress 166
Sweater 164
Jacket 163
Belt 161
Sunglasses 161
Coat 161
Sandals 160
Socks 159
Skirt 158
Shorts 157
Scarf 157
Hat 154
Handbag 153
Hoodie 151
Shoes 150
T-shirt 147
Sneakers 145
Boots 144
Backpack 143
Gloves 140
Jeans 124
Name: count, dtype: int64
Category
Clothing 1737
Accessories 1240
Footwear 599
Outerwear 324
Name: count, dtype: int64
Location
Montana 96
California 95
Idaho 93
Illinois 92
Alabama 89
Minnesota 88
Nebraska 87
New York 87
Nevada 87
Maryland 86
Delaware 86
Vermont 85
Louisiana 84
North Dakota 83
Missouri 81
West Virginia 81
New Mexico 81
Mississippi 80
Indiana 79
Georgia 79
Kentucky 79
Arkansas 79
North Carolina 78
Connecticut 78
Virginia 77
Ohio 77
Tennessee 77
Texas 77
Maine 77
South Carolina 76
Colorado 75
Oklahoma 75
Wisconsin 75
Oregon 74
Pennsylvania 74
Washington 73
Michigan 73
Alaska 72
Massachusetts 72
Wyoming 71
Utah 71
New Hampshire 71
South Dakota 70
Iowa 69
Florida 68
New Jersey 67
Hawaii 65
Arizona 65
Kansas 63
Rhode Island 63
Name: count, dtype: int64
Size
M 1755
L 1053
S 663
XL 429
Name: count, dtype: int64
Color
Olive 177
Yellow 174
Silver 173
Teal 172
Green 169
Black 167
Cyan 166
Violet 166
Gray 159
Maroon 158
Orange 154
Charcoal 153
Pink 153
Magenta 152
Blue 152
Purple 151
Peach 149
Red 148
Beige 147
Indigo 147
Lavender 147
Turquoise 145
White 142
Brown 141
Gold 138
Name: count, dtype: int64
Season
Spring 999
Fall 975
Winter 971
Summer 955
Name: count, dtype: int64
Subscription Status
No 2847
Yes 1053
Name: count, dtype: int64
Payment Method
Credit Card 696
Venmo 653
Cash 648
PayPal 638
Debit Card 633
Bank Transfer 632
Name: count, dtype: int64
Shipping Type
Free Shipping 675
Standard 654
Store Pickup 650
Next Day Air 648
Express 646
2-Day Shipping 627
Name: count, dtype: int64
Discount Applied
No 2223
Yes 1677
Name: count, dtype: int64
Promo Code Used
No 2223
Yes 1677
Name: count, dtype: int64
Preferred Payment Method
PayPal 677
Credit Card 671
Cash 670
Debit Card 636
Venmo 634
Bank Transfer 612
Name: count, dtype: int64
Frequency of Purchases
Every 3 Months 584
Annually 572
Quarterly 563
Monthly 553
Bi-Weekly 547
Fortnightly 542
Weekly 539
Name: count, dtype: int64
# Check for missing values
df2.isnull().sum()
Customer ID 0 Birth Date 0 Gender 0 Item Purchased 0 Category 0 Purchase Amount (USD) 0 Location 0 Size 0 Color 0 Season 0 Review Rating 0 Subscription Status 0 Payment Method 0 Shipping Type 0 Discount Applied 0 Promo Code Used 0 Previous Purchases 0 Preferred Payment Method 0 Frequency of Purchases 0 dtype: int64
#Checking duplicated row
df2.duplicated().sum()
0
Seems like this dataset is clean, the only problem seen is the anomaly value in Gender column
#Evaluating distribution and outlier from the numericla column
#Taking the numerical column
df2_num = df2.select_dtypes(include=['float64', 'int64']).columns.tolist()
def check_distribution_outliers(df, columns):
for i in columns:
plt.figure(figsize=(10, 3))
plt.subplot(1, 2, 1)
sns.histplot(df[i], bins=30)
plt.title(f'Histogram of {i}')
plt.subplot(1, 2, 2)
sns.boxplot(y=df[i])
plt.title(f'Boxplot of {i}')
plt.show()
print(f'Skewness {i}:', df[i].skew())
print(f'Kurtosis {i}:', df[i].kurt())
check_distribution_outliers(df2, df2_num)
Skewness Customer ID: 0.0 Kurtosis Customer ID: -1.1999999999999997
Skewness Purchase Amount (USD): 0.012701757626433795 Kurtosis Purchase Amount (USD): -1.236593691266159
Skewness Review Rating: 0.00452459644246527 Kurtosis Review Rating: -1.1796283021299137
Skewness Previous Purchases: 0.0031211555127652127 Kurtosis Previous Purchases: -1.1901873846405375
Okay no outlier and normal distribution on all numerical category
#Cleaning anomalies in categorical column (especially 'Gender')
val_dict = {
'Female': 'Female',
'f': 'Female',
'Male': 'Male',
'M': 'Male'
}
df2['Gender'] = df2['Gender'].map(val_dict)
df2['Gender'].value_counts()
Gender Male 2652 Female 1248 Name: count, dtype: int64
#Plotting the categorical variable
df2_cat = df2.select_dtypes(include=['object']).columns.tolist()
# Function to plot countplots for categorical features
def plot_categorical_distribution(data, feature):
print(f"{feature} Value Counts:")
print(data[feature].value_counts())
print("\n")
plt.figure(figsize=(8, 4))
sns.countplot(x=feature, data=data, palette='viridis')
plt.xlabel(feature)
plt.ylabel('Count')
plt.title(f'Distribution of {feature}')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Plotting the desired categorical features
for col in df2_cat:
plot_categorical_distribution(df2, col)
Birth Date Value Counts:
Birth Date
1970/12/28 3
1965/08/28 3
1966/12/27 3
1970/08/27 3
1969/05/14 3
..
1985/06/23 1
1996/09/02 1
1984/01/21 1
1972/03/05 1
1980/03/30 1
Name: count, Length: 3534, dtype: int64
Gender Value Counts: Gender Male 2652 Female 1248 Name: count, dtype: int64
Item Purchased Value Counts: Item Purchased Blouse 171 Jewelry 171 Pants 171 Shirt 169 Dress 166 Sweater 164 Jacket 163 Belt 161 Sunglasses 161 Coat 161 Sandals 160 Socks 159 Skirt 158 Shorts 157 Scarf 157 Hat 154 Handbag 153 Hoodie 151 Shoes 150 T-shirt 147 Sneakers 145 Boots 144 Backpack 143 Gloves 140 Jeans 124 Name: count, dtype: int64
Category Value Counts: Category Clothing 1737 Accessories 1240 Footwear 599 Outerwear 324 Name: count, dtype: int64
Location Value Counts: Location Montana 96 California 95 Idaho 93 Illinois 92 Alabama 89 Minnesota 88 Nebraska 87 New York 87 Nevada 87 Maryland 86 Delaware 86 Vermont 85 Louisiana 84 North Dakota 83 Missouri 81 West Virginia 81 New Mexico 81 Mississippi 80 Indiana 79 Georgia 79 Kentucky 79 Arkansas 79 North Carolina 78 Connecticut 78 Virginia 77 Ohio 77 Tennessee 77 Texas 77 Maine 77 South Carolina 76 Colorado 75 Oklahoma 75 Wisconsin 75 Oregon 74 Pennsylvania 74 Washington 73 Michigan 73 Alaska 72 Massachusetts 72 Wyoming 71 Utah 71 New Hampshire 71 South Dakota 70 Iowa 69 Florida 68 New Jersey 67 Hawaii 65 Arizona 65 Kansas 63 Rhode Island 63 Name: count, dtype: int64
Size Value Counts: Size M 1755 L 1053 S 663 XL 429 Name: count, dtype: int64
Color Value Counts: Color Olive 177 Yellow 174 Silver 173 Teal 172 Green 169 Black 167 Cyan 166 Violet 166 Gray 159 Maroon 158 Orange 154 Charcoal 153 Pink 153 Magenta 152 Blue 152 Purple 151 Peach 149 Red 148 Beige 147 Indigo 147 Lavender 147 Turquoise 145 White 142 Brown 141 Gold 138 Name: count, dtype: int64
Season Value Counts: Season Spring 999 Fall 975 Winter 971 Summer 955 Name: count, dtype: int64
Subscription Status Value Counts: Subscription Status No 2847 Yes 1053 Name: count, dtype: int64
Payment Method Value Counts: Payment Method Credit Card 696 Venmo 653 Cash 648 PayPal 638 Debit Card 633 Bank Transfer 632 Name: count, dtype: int64
Shipping Type Value Counts: Shipping Type Free Shipping 675 Standard 654 Store Pickup 650 Next Day Air 648 Express 646 2-Day Shipping 627 Name: count, dtype: int64
Discount Applied Value Counts: Discount Applied No 2223 Yes 1677 Name: count, dtype: int64
Promo Code Used Value Counts: Promo Code Used No 2223 Yes 1677 Name: count, dtype: int64
Preferred Payment Method Value Counts: Preferred Payment Method PayPal 677 Credit Card 671 Cash 670 Debit Card 636 Venmo 634 Bank Transfer 612 Name: count, dtype: int64
Frequency of Purchases Value Counts: Frequency of Purchases Every 3 Months 584 Annually 572 Quarterly 563 Monthly 553 Bi-Weekly 547 Fortnightly 542 Weekly 539 Name: count, dtype: int64
We can see some interesting fact from the plot distribution, although there are some "too much to be plotted" variable such as birthdate. But the other distribution might give us some insight such as:
i would love to do further analysis for grouping by gender, but this is clustering case not classification, and i suppose the exploratory insight are sufficient at this point, so let's do it in another time and focus on the clustering instead
#Creating 'Age' column derived from birth date
#(assuming the dataset are documented in 2025)
current_year = datetime.now().year
df2['Birth Date'] = pd.to_datetime(df2['Birth Date'], errors = 'coerce')
df2['Age'] = current_year - df2['Birth Date'].dt.year
df2.head()
| Customer ID | Birth Date | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1969-03-05 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly | 56 |
| 1 | 2 | 2005-07-01 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly | 20 |
| 2 | 3 | 1974-12-01 | Male | Jeans | Clothing | 73 | Massachusetts | S | Maroon | Spring | 3.1 | Yes | Cash | Free Shipping | Yes | Yes | 23 | Credit Card | Weekly | 51 |
| 3 | 4 | 2003-11-21 | Male | Sandals | Footwear | 90 | Rhode Island | M | Maroon | Spring | 3.5 | Yes | PayPal | Next Day Air | Yes | Yes | 49 | PayPal | Weekly | 22 |
| 4 | 5 | 1979-09-13 | Male | Blouse | Clothing | 49 | Oregon | M | Turquoise | Spring | 2.7 | Yes | Cash | Free Shipping | Yes | Yes | 31 | PayPal | Annually | 46 |
#Standardize the "Frecuency of Purchase" Column
#as it cannot be standardize using encoder
#this frquency are mapped per daily count
freq_map = {
"Daily": 365,
"Weekly": 52,
"Bi-Weekly": 26,
"Fortnightly": 26,
"Monthly": 12,
"Every 3 Months": 4,
"Quarterly": 4,
"Annually": 1
}
df2['Frequency of Purchases'] = df2['Frequency of Purchases'].map(freq_map)
df2.head()
| Customer ID | Birth Date | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1969-03-05 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | 26 | 56 |
| 1 | 2 | 2005-07-01 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | 26 | 20 |
| 2 | 3 | 1974-12-01 | Male | Jeans | Clothing | 73 | Massachusetts | S | Maroon | Spring | 3.1 | Yes | Cash | Free Shipping | Yes | Yes | 23 | Credit Card | 52 | 51 |
| 3 | 4 | 2003-11-21 | Male | Sandals | Footwear | 90 | Rhode Island | M | Maroon | Spring | 3.5 | Yes | PayPal | Next Day Air | Yes | Yes | 49 | PayPal | 52 | 22 |
| 4 | 5 | 1979-09-13 | Male | Blouse | Clothing | 49 | Oregon | M | Turquoise | Spring | 2.7 | Yes | Cash | Free Shipping | Yes | Yes | 31 | PayPal | 1 | 46 |
There will be 5 dropped column from this dataset,
df2 = df2.drop(columns = ['Customer ID', 'Birth Date', 'Item Purchased', 'Color', 'Preferred Payment Method'])
df2.head()
| Gender | Category | Purchase Amount (USD) | Location | Size | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | Clothing | 53 | Kentucky | L | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | 26 | 56 |
| 1 | Male | Clothing | 64 | Maine | L | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | 26 | 20 |
| 2 | Male | Clothing | 73 | Massachusetts | S | Spring | 3.1 | Yes | Cash | Free Shipping | Yes | Yes | 23 | 52 | 51 |
| 3 | Male | Footwear | 90 | Rhode Island | M | Spring | 3.5 | Yes | PayPal | Next Day Air | Yes | Yes | 49 | 52 | 22 |
| 4 | Male | Clothing | 49 | Oregon | M | Spring | 2.7 | Yes | Cash | Free Shipping | Yes | Yes | 31 | 1 | 46 |
#Encoing Categorical Variable
#Seeing the category distribution of categorical variable
df2_cat = df2.select_dtypes(include=['object']).columns
df2_cat = df2_cat.tolist()
df2_cat
for i in df2_cat:
print(df2[i].value_counts())
print()
Gender Male 2652 Female 1248 Name: count, dtype: int64 Category Clothing 1737 Accessories 1240 Footwear 599 Outerwear 324 Name: count, dtype: int64 Location Montana 96 California 95 Idaho 93 Illinois 92 Alabama 89 Minnesota 88 Nebraska 87 New York 87 Nevada 87 Maryland 86 Delaware 86 Vermont 85 Louisiana 84 North Dakota 83 Missouri 81 West Virginia 81 New Mexico 81 Mississippi 80 Indiana 79 Georgia 79 Kentucky 79 Arkansas 79 North Carolina 78 Connecticut 78 Virginia 77 Ohio 77 Tennessee 77 Texas 77 Maine 77 South Carolina 76 Colorado 75 Oklahoma 75 Wisconsin 75 Oregon 74 Pennsylvania 74 Washington 73 Michigan 73 Alaska 72 Massachusetts 72 Wyoming 71 Utah 71 New Hampshire 71 South Dakota 70 Iowa 69 Florida 68 New Jersey 67 Hawaii 65 Arizona 65 Kansas 63 Rhode Island 63 Name: count, dtype: int64 Size M 1755 L 1053 S 663 XL 429 Name: count, dtype: int64 Season Spring 999 Fall 975 Winter 971 Summer 955 Name: count, dtype: int64 Subscription Status No 2847 Yes 1053 Name: count, dtype: int64 Payment Method Credit Card 696 Venmo 653 Cash 648 PayPal 638 Debit Card 633 Bank Transfer 632 Name: count, dtype: int64 Shipping Type Free Shipping 675 Standard 654 Store Pickup 650 Next Day Air 648 Express 646 2-Day Shipping 627 Name: count, dtype: int64 Discount Applied No 2223 Yes 1677 Name: count, dtype: int64 Promo Code Used No 2223 Yes 1677 Name: count, dtype: int64
For Clustering, the data need to be encoded into numerical representation as clustering algorithm rely on distance calculations
We can find 3 kind of encoding here, such as:
#Label Encoding
label_col = ['Gender', 'Subscription Status', 'Discount Applied', 'Promo Code Used']
label_encoder = LabelEncoder()
for i in label_col:
df2[i] = label_encoder.fit_transform(df2[i])
df2.head()
| Gender | Category | Purchase Amount (USD) | Location | Size | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Clothing | 53 | Kentucky | L | Winter | 3.1 | 1 | Credit Card | Express | 1 | 1 | 14 | 26 | 56 |
| 1 | 1 | Clothing | 64 | Maine | L | Winter | 3.1 | 1 | Bank Transfer | Express | 1 | 1 | 2 | 26 | 20 |
| 2 | 1 | Clothing | 73 | Massachusetts | S | Spring | 3.1 | 1 | Cash | Free Shipping | 1 | 1 | 23 | 52 | 51 |
| 3 | 1 | Footwear | 90 | Rhode Island | M | Spring | 3.5 | 1 | PayPal | Next Day Air | 1 | 1 | 49 | 52 | 22 |
| 4 | 1 | Clothing | 49 | Oregon | M | Spring | 2.7 | 1 | Cash | Free Shipping | 1 | 1 | 31 | 1 | 46 |
#Ordinal Encoding
size_map = {'M': 0, 'L': 1, 'S': 2, 'XL': 3}
df2['Size'] = df2['Size'].map(size_map)
df2.head()
| Gender | Category | Purchase Amount (USD) | Location | Size | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Clothing | 53 | Kentucky | 1 | Winter | 3.1 | 1 | Credit Card | Express | 1 | 1 | 14 | 26 | 56 |
| 1 | 1 | Clothing | 64 | Maine | 1 | Winter | 3.1 | 1 | Bank Transfer | Express | 1 | 1 | 2 | 26 | 20 |
| 2 | 1 | Clothing | 73 | Massachusetts | 2 | Spring | 3.1 | 1 | Cash | Free Shipping | 1 | 1 | 23 | 52 | 51 |
| 3 | 1 | Footwear | 90 | Rhode Island | 0 | Spring | 3.5 | 1 | PayPal | Next Day Air | 1 | 1 | 49 | 52 | 22 |
| 4 | 1 | Clothing | 49 | Oregon | 0 | Spring | 2.7 | 1 | Cash | Free Shipping | 1 | 1 | 31 | 1 | 46 |
#One Hot Encoding
df2 = pd.get_dummies(df2, columns=['Category', 'Location', 'Season', 'Payment Method', 'Shipping Type'], drop_first=True)
df2.head()
| Gender | Purchase Amount (USD) | Size | Review Rating | Subscription Status | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | Category_Clothing | Category_Footwear | Category_Outerwear | Location_Alaska | Location_Arizona | Location_Arkansas | Location_California | Location_Colorado | Location_Connecticut | Location_Delaware | Location_Florida | Location_Georgia | Location_Hawaii | Location_Idaho | Location_Illinois | Location_Indiana | Location_Iowa | Location_Kansas | Location_Kentucky | Location_Louisiana | Location_Maine | Location_Maryland | Location_Massachusetts | Location_Michigan | Location_Minnesota | Location_Mississippi | Location_Missouri | Location_Montana | Location_Nebraska | Location_Nevada | Location_New Hampshire | Location_New Jersey | Location_New Mexico | Location_New York | Location_North Carolina | Location_North Dakota | Location_Ohio | Location_Oklahoma | Location_Oregon | Location_Pennsylvania | Location_Rhode Island | Location_South Carolina | Location_South Dakota | Location_Tennessee | Location_Texas | Location_Utah | Location_Vermont | Location_Virginia | Location_Washington | Location_West Virginia | Location_Wisconsin | Location_Wyoming | Season_Spring | Season_Summer | Season_Winter | Payment Method_Cash | Payment Method_Credit Card | Payment Method_Debit Card | Payment Method_PayPal | Payment Method_Venmo | Shipping Type_Express | Shipping Type_Free Shipping | Shipping Type_Next Day Air | Shipping Type_Standard | Shipping Type_Store Pickup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 53 | 1 | 3.1 | 1 | 1 | 1 | 14 | 26 | 56 | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | True | False | False | False | True | False | False | False | False |
| 1 | 1 | 64 | 1 | 3.1 | 1 | 1 | 1 | 2 | 26 | 20 | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | True | False | False | False | False |
| 2 | 1 | 73 | 2 | 3.1 | 1 | 1 | 1 | 23 | 52 | 51 | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | True | False | False | False | False | False | True | False | False | False |
| 3 | 1 | 90 | 0 | 3.5 | 1 | 1 | 1 | 49 | 52 | 22 | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | True | False | False | False | True | False | False |
| 4 | 1 | 49 | 0 | 2.7 | 1 | 1 | 1 | 31 | 1 | 46 | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | True | False | False | False | False | False | True | False | False | False |
#The value become True and False, which we avoids due to distance based algorithm of KMeans
#Thus we re-encode them into label using Label Encoder
onehotdf2 = df2.select_dtypes(include=['object', 'category', 'bool']).columns
label_encoder = LabelEncoder()
for i in onehotdf2:
df2[i] = label_encoder.fit_transform(df2[i])
df2.head()
| Gender | Purchase Amount (USD) | Size | Review Rating | Subscription Status | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | Category_Clothing | Category_Footwear | Category_Outerwear | Location_Alaska | Location_Arizona | Location_Arkansas | Location_California | Location_Colorado | Location_Connecticut | Location_Delaware | Location_Florida | Location_Georgia | Location_Hawaii | Location_Idaho | Location_Illinois | Location_Indiana | Location_Iowa | Location_Kansas | Location_Kentucky | Location_Louisiana | Location_Maine | Location_Maryland | Location_Massachusetts | Location_Michigan | Location_Minnesota | Location_Mississippi | Location_Missouri | Location_Montana | Location_Nebraska | Location_Nevada | Location_New Hampshire | Location_New Jersey | Location_New Mexico | Location_New York | Location_North Carolina | Location_North Dakota | Location_Ohio | Location_Oklahoma | Location_Oregon | Location_Pennsylvania | Location_Rhode Island | Location_South Carolina | Location_South Dakota | Location_Tennessee | Location_Texas | Location_Utah | Location_Vermont | Location_Virginia | Location_Washington | Location_West Virginia | Location_Wisconsin | Location_Wyoming | Season_Spring | Season_Summer | Season_Winter | Payment Method_Cash | Payment Method_Credit Card | Payment Method_Debit Card | Payment Method_PayPal | Payment Method_Venmo | Shipping Type_Express | Shipping Type_Free Shipping | Shipping Type_Next Day Air | Shipping Type_Standard | Shipping Type_Store Pickup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 53 | 1 | 3.1 | 1 | 1 | 1 | 14 | 26 | 56 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 1 | 64 | 1 | 3.1 | 1 | 1 | 1 | 2 | 26 | 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 1 | 73 | 2 | 3.1 | 1 | 1 | 1 | 23 | 52 | 51 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 1 | 90 | 0 | 3.5 | 1 | 1 | 1 | 49 | 52 | 22 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 1 | 49 | 0 | 2.7 | 1 | 1 | 1 | 31 | 1 | 46 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
#Scaling
#Using standard scaler because the data is normally distributed and the data contains different unit (Age and Purchase Amount)
scaler = StandardScaler()
numcol = ['Purchase Amount (USD)', 'Review Rating', 'Previous Purchases', 'Frequency of Purchases', 'Age']
for col in numcol:
df2[col] = scaler.fit_transform(df2[[col]])
df2.head()
| Gender | Purchase Amount (USD) | Size | Review Rating | Subscription Status | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | Category_Clothing | Category_Footwear | Category_Outerwear | Location_Alaska | Location_Arizona | Location_Arkansas | Location_California | Location_Colorado | Location_Connecticut | Location_Delaware | Location_Florida | Location_Georgia | Location_Hawaii | Location_Idaho | Location_Illinois | Location_Indiana | Location_Iowa | Location_Kansas | Location_Kentucky | Location_Louisiana | Location_Maine | Location_Maryland | Location_Massachusetts | Location_Michigan | Location_Minnesota | Location_Mississippi | Location_Missouri | Location_Montana | Location_Nebraska | Location_Nevada | Location_New Hampshire | Location_New Jersey | Location_New Mexico | Location_New York | Location_North Carolina | Location_North Dakota | Location_Ohio | Location_Oklahoma | Location_Oregon | Location_Pennsylvania | Location_Rhode Island | Location_South Carolina | Location_South Dakota | Location_Tennessee | Location_Texas | Location_Utah | Location_Vermont | Location_Virginia | Location_Washington | Location_West Virginia | Location_Wisconsin | Location_Wyoming | Season_Spring | Season_Summer | Season_Winter | Payment Method_Cash | Payment Method_Credit Card | Payment Method_Debit Card | Payment Method_PayPal | Payment Method_Venmo | Shipping Type_Express | Shipping Type_Free Shipping | Shipping Type_Next Day Air | Shipping Type_Standard | Shipping Type_Store Pickup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | -0.285629 | 1 | -0.907584 | 1 | 1 | 1 | -0.785831 | 0.507436 | 0.718913 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 1 | 0.178852 | 1 | -0.907584 | 1 | 1 | 1 | -1.616552 | 0.507436 | -1.648629 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0.558882 | 2 | -0.907584 | 1 | 1 | 1 | -0.162789 | 2.054365 | 0.390088 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 1 | 1.276716 | 0 | -0.349027 | 1 | 1 | 1 | 1.637107 | 2.054365 | -1.517099 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 1 | -0.454531 | 0 | -1.466141 | 1 | 1 | 1 | 0.391025 | -0.979997 | 0.061263 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
Due to high dimensional of dataset (3900 row x 75 columns), it is best to do PCA to avoid Curse of Dimensionality that often happen in KMeans clustering, caused by high dimensional dataset, because it hinders the algorithm to find meaningful clusters. PCA work by focusing on the components that explain the most variance in the data while retaining the structure. Let's proceed the PCA
pca = PCA(n_components=0.95) #Retaining 95% of variance
df2_pca = pca.fit_transform(df2)
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
df_pca
Explained variance ratio: [0.10619763 0.10062866 0.09848272 0.09667243 0.09314724 0.09224429 0.06746776 0.02751125 0.02453862 0.02373997 0.01728691 0.01680921 0.01643068 0.01631891 0.0161494 0.01584853 0.01555977 0.01544304 0.01226173 0.01111639 0.00746507 0.00600265 0.00536653 0.00271062 0.00266159 0.00235733 0.0023392 0.00228713 0.00225682 0.00217424 0.00215029 0.00214898 0.0021307 0.00211785 0.00210304 0.00208199 0.0020561 0.0020377 0.00200308 0.00199214 0.00198279 0.00197028 0.0019526 0.0019444 0.00194237]
array([[-6.59909549e-01, -1.00780477e-01, 8.57522736e-01, ...,
-4.00714687e-01, -1.06090042e-01, -5.50112069e-01],
[ 1.99373465e-01, -2.16399925e+00, 5.24466670e-01, ...,
7.25984878e-03, -7.82015360e-02, 9.90030479e-02],
[ 2.37583283e-01, 4.85980633e-01, 1.36110401e+00, ...,
-2.08527041e-02, -3.16804644e-03, -1.61264630e-02],
...,
[-8.30010521e-01, -2.12381073e-01, 6.81283190e-01, ...,
-3.32378908e-03, 1.04139025e-02, 3.27873947e-02],
[ 9.04071708e-01, 2.67116139e-01, 9.12562032e-01, ...,
1.80351851e-02, 1.52787594e-02, -2.03419296e-02],
[-1.06022198e+00, 3.80677455e-01, -1.31470434e+00, ...,
1.41131640e-02, -2.24657295e-03, 1.63495516e-04]])
#Choosing the optimal number of cluster K
#Using Elbow Method, Sillhouette Score, and Davies Bouldine
dist = []
sil_score = []
db_scores = []
K_range = range(1,11)
for k in K_range:
kmeans = KMeans(n_clusters=k, random_state = 123)
kmeans.fit(df2_pca)
labels = kmeans.labels_
dist.append(kmeans.inertia_)
#Sillhouette Score
if k > 1:
sil_score.append(silhouette_score(df2_pca, labels))
else:
sil_score.append(None)
#David Bouildin Score
if k > 1:
db_scores.append(davies_bouldin_score(df2_pca, labels))
else:
db_scores.append(None)
#Plotting Elbow Method
plt.figure(figsize=(12, 6))
plt.subplot(1, 3, 1)
plt.plot(K_range, dist, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Distortion (Inertia)')
#Plotting Silhouette
plt.subplot(1, 3, 2)
plt.plot(K_range[1:], sil_score[1:], marker='o', color='g')
plt.title('Silhouette Score for Optimal K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Silhouette Score')
#Plotting Davies Bouildin
plt.subplot(1, 3, 3)
plt.plot(K_range, db_scores, marker='o', color='r')
plt.title('Davies-Bouldin Score for Optimal K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Davies-Bouldin Score')
plt.tight_layout()
plt.show()
# Determine the optimal K based on the metrics
valid_sil_scores = [(score, K_range[idx]) for idx, score in enumerate(sil_score) if score is not None]
max_sil_score, optimal_k_sil = max(valid_sil_scores, key=lambda x: x[0])
valid_db_scores = [(score, K_range[idx]) for idx, score in enumerate(db_scores) if score is not None]
min_db_score, optimal_k_db = min(valid_db_scores, key=lambda x: x[0])
print(f"Optimal K based on Silhouette Score: {optimal_k_sil}")
print(f"Optimal K based on Davies-Bouldin Score: {optimal_k_db}")
Optimal K based on Silhouette Score: 3 Optimal K based on Davies-Bouldin Score: 7
By this metrics, we can see that in Elbow Method, the elbow can be seen between 3 and 4. In Sillhouette score, the peak is at K = 3, but in Davies Bouldin, the lowest score are found in 7.
In Davies Bouldin, lower score indicate well-separated and compact cluster, but seeing the score lies under 3.0 for K = 3, i think K = 3 will be the most optimal cluster
#Choosing the optimal K
optimal_k = optimal_k_sil
#Choosing K = 3 as in sillhouette score due
#Training the final K-Means model
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42)
final_labels = final_kmeans.fit_predict(df2_pca)
df2['Cluster'] = final_labels
df2.head()
| Gender | Purchase Amount (USD) | Size | Review Rating | Subscription Status | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | Category_Clothing | Category_Footwear | Category_Outerwear | Location_Alaska | Location_Arizona | Location_Arkansas | Location_California | Location_Colorado | Location_Connecticut | Location_Delaware | Location_Florida | Location_Georgia | Location_Hawaii | Location_Idaho | Location_Illinois | Location_Indiana | Location_Iowa | Location_Kansas | Location_Kentucky | Location_Louisiana | Location_Maine | Location_Maryland | Location_Massachusetts | Location_Michigan | Location_Minnesota | Location_Mississippi | Location_Missouri | Location_Montana | Location_Nebraska | Location_Nevada | Location_New Hampshire | Location_New Jersey | Location_New Mexico | Location_New York | Location_North Carolina | Location_North Dakota | Location_Ohio | Location_Oklahoma | Location_Oregon | Location_Pennsylvania | Location_Rhode Island | Location_South Carolina | Location_South Dakota | Location_Tennessee | Location_Texas | Location_Utah | Location_Vermont | Location_Virginia | Location_Washington | Location_West Virginia | Location_Wisconsin | Location_Wyoming | Season_Spring | Season_Summer | Season_Winter | Payment Method_Cash | Payment Method_Credit Card | Payment Method_Debit Card | Payment Method_PayPal | Payment Method_Venmo | Shipping Type_Express | Shipping Type_Free Shipping | Shipping Type_Next Day Air | Shipping Type_Standard | Shipping Type_Store Pickup | Cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | -0.285629 | 1 | -0.907584 | 1 | 1 | 1 | -0.785831 | 0.507436 | 0.718913 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1 | 1 | 0.178852 | 1 | -0.907584 | 1 | 1 | 1 | -1.616552 | 0.507436 | -1.648629 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 |
| 2 | 1 | 0.558882 | 2 | -0.907584 | 1 | 1 | 1 | -0.162789 | 2.054365 | 0.390088 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 3 | 1 | 1.276716 | 0 | -0.349027 | 1 | 1 | 1 | 1.637107 | 2.054365 | -1.517099 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 2 |
| 4 | 1 | -0.454531 | 0 | -1.466141 | 1 | 1 | 1 | 0.391025 | -0.979997 | 0.061263 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
df2['Cluster'].value_counts()
Cluster 1 1468 0 1222 2 1210 Name: count, dtype: int64
plt.figure(figsize=(10, 6))
sns.scatterplot(
x=df2_pca[:, 0], y=df2_pca[:, 1],
hue=final_labels, palette='Set1',
s=50, alpha=0.7
)
plt.scatter(
final_kmeans.cluster_centers_[:, 0], final_kmeans.cluster_centers_[:, 1],
color='black', marker='X', s=200, label='Centroids'
)
plt.title('K-Means Clustering (2D Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()
The cluster seems overlap significantly, especially near the cluster boundaries, this indicates that the cluster are not well separated, which may suggest that the feature used in clustering do not clearly distinguish the groups, let's refine this by asessing the feature
#Evaluating the variance of feature
selector = VarianceThreshold(threshold=0.1) # Remove low-variance features
selected_features = selector.fit_transform(df2)
selected_columns = df2.columns[selector.get_support()]
print(f"Selected Features: {selected_columns}")
Selected Features: Index(['Gender', 'Purchase Amount (USD)', 'Size', 'Review Rating',
'Subscription Status', 'Discount Applied', 'Promo Code Used',
'Previous Purchases', 'Frequency of Purchases', 'Age',
'Category_Clothing', 'Category_Footwear', 'Season_Spring',
'Season_Summer', 'Season_Winter', 'Payment Method_Cash',
'Payment Method_Credit Card', 'Payment Method_Debit Card',
'Payment Method_PayPal', 'Payment Method_Venmo',
'Shipping Type_Express', 'Shipping Type_Free Shipping',
'Shipping Type_Next Day Air', 'Shipping Type_Standard',
'Shipping Type_Store Pickup', 'Cluster'],
dtype='object')
#Visualize feature distributions within clusters
kmeans = KMeans(n_clusters=3, random_state=42)
df2['Cluster'] = kmeans.fit_predict(selected_features)
for feature in selected_columns:
plt.figure(figsize=(8, 4))
sns.boxplot(x='Cluster', y=feature, data=df2)
plt.title(f"Distribution of {feature} across Clusters")
plt.show()
#Use PCA to reduce dimensions and find key components
pca = PCA(n_components=2)
pca_features = pca.fit_transform(selected_features)
after the feature was selected, we try to do evaluate the sillhouette, elbow, and david score for the model
dist = []
sil_score = []
db_scores = []
K_range = range(1,11)
for k in K_range:
kmeans = KMeans(n_clusters=k, random_state = 123)
kmeans.fit(pca_features)
labels = kmeans.labels_
dist.append(kmeans.inertia_)
#Sillhouette Score
if k > 1:
sil_score.append(silhouette_score(pca_features, labels))
else:
sil_score.append(None)
#David Bouildin Score
if k > 1:
db_scores.append(davies_bouldin_score(pca_features, labels))
else:
db_scores.append(None)
#Plotting Elbow Method
plt.figure(figsize=(12, 6))
plt.subplot(1, 3, 1)
plt.plot(K_range, dist, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Distortion (Inertia)')
#Plotting Silhouette
plt.subplot(1, 3, 2)
plt.plot(K_range[1:], sil_score[1:], marker='o', color='g')
plt.title('Silhouette Score for Optimal K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Silhouette Score')
#Plotting Davies Bouildin
plt.subplot(1, 3, 3)
plt.plot(K_range, db_scores, marker='o', color='r')
plt.title('Davies-Bouldin Score for Optimal K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Davies-Bouldin Score')
plt.tight_layout()
plt.show()
Turns out after selection, the best cluster are K=2
#Re-cluster with selected features and evaluate silhouette score
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(selected_features)
silhouette = silhouette_score(selected_features, clusters)
print(f"Silhouette Score with Selected Features: {silhouette}")
Silhouette Score with Selected Features: 0.11049581937747842
centroids_pca = pca.transform(kmeans.cluster_centers_)
plt.figure(figsize=(10, 6))
sns.scatterplot(
x= pca_features[:, 0], y= pca_features[:, 1],
hue=clusters, palette='Set1',
s=50, alpha=0.7
)
plt.scatter(
centroids_pca[:, 0], centroids_pca[:, 1],
color='black', marker='o', s=200, label='Centroids'
)
plt.title('K-Means Clustering (2D Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()
#Re-label the cluster
df2['Cluster'] = clusters
df2.head()
| Gender | Purchase Amount (USD) | Size | Review Rating | Subscription Status | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | Category_Clothing | Category_Footwear | Category_Outerwear | Location_Alaska | Location_Arizona | Location_Arkansas | Location_California | Location_Colorado | Location_Connecticut | Location_Delaware | Location_Florida | Location_Georgia | Location_Hawaii | Location_Idaho | Location_Illinois | Location_Indiana | Location_Iowa | Location_Kansas | Location_Kentucky | Location_Louisiana | Location_Maine | Location_Maryland | Location_Massachusetts | Location_Michigan | Location_Minnesota | Location_Mississippi | Location_Missouri | Location_Montana | Location_Nebraska | Location_Nevada | Location_New Hampshire | Location_New Jersey | Location_New Mexico | Location_New York | Location_North Carolina | Location_North Dakota | Location_Ohio | Location_Oklahoma | Location_Oregon | Location_Pennsylvania | Location_Rhode Island | Location_South Carolina | Location_South Dakota | Location_Tennessee | Location_Texas | Location_Utah | Location_Vermont | Location_Virginia | Location_Washington | Location_West Virginia | Location_Wisconsin | Location_Wyoming | Season_Spring | Season_Summer | Season_Winter | Payment Method_Cash | Payment Method_Credit Card | Payment Method_Debit Card | Payment Method_PayPal | Payment Method_Venmo | Shipping Type_Express | Shipping Type_Free Shipping | Shipping Type_Next Day Air | Shipping Type_Standard | Shipping Type_Store Pickup | Cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | -0.285629 | 1 | -0.907584 | 1 | 1 | 1 | -0.785831 | 0.507436 | 0.718913 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 0.178852 | 1 | -0.907584 | 1 | 1 | 1 | -1.616552 | 0.507436 | -1.648629 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 1 | 0.558882 | 2 | -0.907584 | 1 | 1 | 1 | -0.162789 | 2.054365 | 0.390088 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 1 | 1.276716 | 0 | -0.349027 | 1 | 1 | 1 | 1.637107 | 2.054365 | -1.517099 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 4 | 1 | -0.454531 | 0 | -1.466141 | 1 | 1 | 1 | 0.391025 | -0.979997 | 0.061263 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
Now with feature analysis, the cluster are better separated with less overlaps, it shows clear boundaries between clusters, which indicates that the PCA-reduced dimensions capture the clustering structure effectively
df2.groupby('Cluster').agg(['mean', 'median', 'std'])
| Gender | Purchase Amount (USD) | Size | Review Rating | Subscription Status | Discount Applied | Promo Code Used | Previous Purchases | Frequency of Purchases | Age | Category_Clothing | Category_Footwear | Category_Outerwear | Location_Alaska | Location_Arizona | Location_Arkansas | Location_California | Location_Colorado | Location_Connecticut | Location_Delaware | Location_Florida | Location_Georgia | Location_Hawaii | Location_Idaho | Location_Illinois | Location_Indiana | Location_Iowa | Location_Kansas | Location_Kentucky | Location_Louisiana | Location_Maine | Location_Maryland | Location_Massachusetts | Location_Michigan | Location_Minnesota | Location_Mississippi | Location_Missouri | Location_Montana | Location_Nebraska | Location_Nevada | Location_New Hampshire | Location_New Jersey | Location_New Mexico | Location_New York | Location_North Carolina | Location_North Dakota | Location_Ohio | Location_Oklahoma | Location_Oregon | Location_Pennsylvania | Location_Rhode Island | Location_South Carolina | Location_South Dakota | Location_Tennessee | Location_Texas | Location_Utah | Location_Vermont | Location_Virginia | Location_Washington | Location_West Virginia | Location_Wisconsin | Location_Wyoming | Season_Spring | Season_Summer | Season_Winter | Payment Method_Cash | Payment Method_Credit Card | Payment Method_Debit Card | Payment Method_PayPal | Payment Method_Venmo | Shipping Type_Express | Shipping Type_Free Shipping | Shipping Type_Next Day Air | Shipping Type_Standard | Shipping Type_Store Pickup | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | |
| Cluster | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 0 | 0.690599 | 1.0 | 0.462366 | 0.040878 | 0.052176 | 1.006247 | 0.981405 | 1.0 | 1.038616 | -0.032674 | -0.069748 | 0.998590 | 0.286674 | 0.0 | 0.452325 | 0.443182 | 0.0 | 0.496890 | 0.443182 | 0.0 | 0.496890 | 0.118587 | 0.183345 | 0.987404 | -0.006681 | -0.325526 | 1.007304 | 0.862231 | 0.850444 | 0.510687 | 0.436467 | 0.0 | 0.496075 | 0.164256 | 0.0 | 0.370604 | 0.084194 | 0.0 | 0.277751 | 0.017562 | 0.0 | 0.131387 | 0.017562 | 0.0 | 0.131387 | 0.019628 | 0.0 | 0.138754 | 0.022211 | 0.0 | 0.147406 | 0.018079 | 0.0 | 0.133270 | 0.022727 | 0.0 | 0.149071 | 0.019112 | 0.0 | 0.136953 | 0.013430 | 0.0 | 0.115136 | 0.020145 | 0.0 | 0.140531 | 0.017562 | 0.0 | 0.131387 | 0.022211 | 0.0 | 0.147406 | 0.023244 | 0.0 | 0.150716 | 0.024793 | 0.0 | 0.155535 | 0.012913 | 0.0 | 0.112929 | 0.014979 | 0.0 | 0.121501 | 0.017045 | 0.0 | 0.129474 | 0.025826 | 0.0 | 0.158658 | 0.017562 | 0.0 | 0.131387 | 0.023760 | 0.0 | 0.152341 | 0.018595 | 0.0 | 0.135125 | 0.016012 | 0.0 | 0.125555 | 0.023760 | 0.0 | 0.152341 | 0.020145 | 0.0 | 0.140531 | 0.021178 | 0.0 | 0.144014 | 0.023244 | 0.0 | 0.150716 | 0.022211 | 0.0 | 0.147406 | 0.020661 | 0.0 | 0.142284 | 0.019112 | 0.0 | 0.136953 | 0.018079 | 0.0 | 0.133270 | 0.021178 | 0.0 | 0.144014 | 0.023244 | 0.0 | 0.150716 | 0.022727 | 0.0 | 0.149071 | 0.020145 | 0.0 | 0.140531 | 0.024277 | 0.0 | 0.153947 | 0.019628 | 0.0 | 0.138754 | 0.019628 | 0.0 | 0.138754 | 0.018595 | 0.0 | 0.135125 | 0.019112 | 0.0 | 0.136953 | 0.023760 | 0.0 | 0.152341 | 0.018079 | 0.0 | 0.133270 | 0.020661 | 0.0 | 0.142284 | 0.018079 | 0.0 | 0.133270 | 0.018079 | 0.0 | 0.133270 | 0.021178 | 0.0 | 0.144014 | 0.017562 | 0.0 | 0.131387 | 0.018079 | 0.0 | 0.133270 | 0.020145 | 0.0 | 0.140531 | 0.021178 | 0.0 | 0.144014 | 0.018079 | 0.0 | 0.133270 | 0.264463 | 0.0 | 0.441161 | 0.242252 | 0.0 | 0.428557 | 0.236570 | 0.0 | 0.425086 | 0.170455 | 0.0 | 0.376129 | 0.177686 | 0.0 | 0.382347 | 0.154959 | 0.0 | 0.361959 | 0.162190 | 0.0 | 0.368720 | 0.170971 | 0.0 | 0.376581 | 0.177169 | 0.0 | 0.381911 | 0.167872 | 0.0 | 0.373849 | 0.155992 | 0.0 | 0.362941 | 0.179752 | 0.0 | 0.384080 | 0.161674 | 0.0 | 0.368246 |
| 1 | 0.669552 | 1.0 | 0.470494 | -0.040295 | -0.032275 | 0.992669 | 0.899185 | 1.0 | 1.015930 | 0.032209 | 0.069891 | 1.000853 | 0.253564 | 0.0 | 0.435162 | 0.417006 | 0.0 | 0.493189 | 0.417006 | 0.0 | 0.493189 | -0.116897 | -0.162789 | 0.999076 | 0.006586 | -0.325526 | 0.993217 | -0.849939 | -0.859448 | 0.523158 | 0.454175 | 0.0 | 0.498022 | 0.143075 | 0.0 | 0.350239 | 0.081976 | 0.0 | 0.274397 | 0.019348 | 0.0 | 0.137781 | 0.015784 | 0.0 | 0.124671 | 0.020876 | 0.0 | 0.143005 | 0.026477 | 0.0 | 0.160589 | 0.020367 | 0.0 | 0.141287 | 0.017312 | 0.0 | 0.130463 | 0.024949 | 0.0 | 0.156010 | 0.021385 | 0.0 | 0.144701 | 0.020367 | 0.0 | 0.141287 | 0.015784 | 0.0 | 0.124671 | 0.025458 | 0.0 | 0.157552 | 0.023931 | 0.0 | 0.152872 | 0.015784 | 0.0 | 0.124671 | 0.022403 | 0.0 | 0.148029 | 0.017312 | 0.0 | 0.130463 | 0.023422 | 0.0 | 0.151277 | 0.017312 | 0.0 | 0.130463 | 0.021894 | 0.0 | 0.146375 | 0.020367 | 0.0 | 0.141287 | 0.018330 | 0.0 | 0.134176 | 0.021385 | 0.0 | 0.144701 | 0.021385 | 0.0 | 0.144701 | 0.020876 | 0.0 | 0.143005 | 0.020367 | 0.0 | 0.141287 | 0.025967 | 0.0 | 0.159079 | 0.022403 | 0.0 | 0.148029 | 0.023931 | 0.0 | 0.152872 | 0.017312 | 0.0 | 0.130463 | 0.016293 | 0.0 | 0.126633 | 0.020367 | 0.0 | 0.141287 | 0.021385 | 0.0 | 0.144701 | 0.017312 | 0.0 | 0.130463 | 0.022403 | 0.0 | 0.148029 | 0.015275 | 0.0 | 0.122676 | 0.018839 | 0.0 | 0.135991 | 0.018330 | 0.0 | 0.134176 | 0.019348 | 0.0 | 0.137781 | 0.013238 | 0.0 | 0.114323 | 0.015275 | 0.0 | 0.122676 | 0.017821 | 0.0 | 0.132333 | 0.018839 | 0.0 | 0.135991 | 0.021385 | 0.0 | 0.144701 | 0.018330 | 0.0 | 0.134176 | 0.022403 | 0.0 | 0.148029 | 0.021894 | 0.0 | 0.146375 | 0.019348 | 0.0 | 0.137781 | 0.021385 | 0.0 | 0.144701 | 0.017312 | 0.0 | 0.130463 | 0.018330 | 0.0 | 0.134176 | 0.247963 | 0.0 | 0.431940 | 0.247454 | 0.0 | 0.431643 | 0.261202 | 0.0 | 0.439401 | 0.161914 | 0.0 | 0.368466 | 0.179226 | 0.0 | 0.383639 | 0.169552 | 0.0 | 0.375334 | 0.164969 | 0.0 | 0.371247 | 0.163951 | 0.0 | 0.370326 | 0.154277 | 0.0 | 0.361306 | 0.178208 | 0.0 | 0.382785 | 0.176171 | 0.0 | 0.381063 | 0.155804 | 0.0 | 0.362762 | 0.171589 | 0.0 | 0.377118 |
import matplotlib.pyplot as plt
import seaborn as sns
def analyze_clusters(df, columns, cluster_column='Cluster'):
num_columns = len(columns)
rows = (num_columns + 2) // 3
fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows))
axes = axes.flatten()
for i, column in enumerate(columns):
sns.boxplot(data=df, x=cluster_column, y=column, ax=axes[i], palette='Set3')
axes[i].set_title(f'{column} Distribution by {cluster_column}')
axes[i].set_xlabel(cluster_column)
axes[i].set_ylabel(column)
# Hide unused subplots if any
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
analyze_clusters(df2, anacol)
import matplotlib.pyplot as plt
import seaborn as sns
def analyze_clusters(df, columns, cluster_column='Cluster'):
num_columns = len(columns)
rows = (num_columns + 2) // 3
fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows))
axes = axes.flatten()
for i, column in enumerate(columns):
sns.barplot(x=cluster_column, y=column, data=df, ax=axes[i], palette = 'viridis')
axes[i].set_title(f'{column} Distribution by {cluster_column}')
axes[i].set_xlabel(cluster_column)
axes[i].set_ylabel(column)
# Hide unused subplots if any
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
anacol = df2.drop(columns='Cluster').columns
analyze_clusters(df2, anacol)
Usage distribution analysis to understand the cluster characteristic
Feature with high variation among clusters are likely the most important for segmenting the customers, it's more likely to be more influential in defininig the cluster's characeristic. This feature couls be differentiate by mean of a feature per cluster, the bar represent the averagre value of the feature for that specific cluster, while the boxplot could gice us insight of the data distribution for each cluster. This can be interpreted by seeing the distinctiveess between bar heights
Let's analyze these plot thoroughly:
similar with Purchase Amount, Age, Previous Purchases, Frequency of Purchases and Review Ratings** also have quite distinctiveness between it's distribution, let's break it down one by one.
Size distribution plot shows that cluster 0 has the higest average size suggesting the customer in this cluster tend to have larger size, while cluster 1 has the smaller average size indicating smaller cluster of size of customer
Other feature may be interpreted as same as how i interpret it above, but the raw conclusion we can take from some of the feature are:
Summary and Insights
======================================================================
Using PCA, PCA component can suggest what features are driving the variability in the data. In the clustering, PC 1 and 2 are usually used such because PC 1 explains 50% of the variance, meaning this component captures 50% of the total information from the data, PC2 explain 30% variance, so it capture additional 30%. These data combined together explain the 80% of the variance in the data
explained_variance = pca.explained_variance_ratio_
pca_components = pd.DataFrame(pca.components_, index=[f'PC{i+1}' for i in range(len(explained_variance))])
pca_components
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| PC1 | 0.007715 | 0.270093 | 0.756556 | 0.513502 | -0.010000 | -0.008683 | -0.008683 | -0.091775 | -0.068350 | -0.276721 | -0.005310 | 0.003771 | -0.006177 | 0.003086 | 0.002686 | -0.001373 | 0.001520 | -0.000853 | -0.003603 | -0.000492 | -0.006063 | -4.048212e-03 | -0.002946 | 0.002052 | 0.000515 | 0.002178 | 0.003177 | 0.000264 | -0.000078 | -0.004170 | 0.001036 | -0.002280 | 0.001000 | 0.001911 | -0.004273 | 0.004835 | -0.002758 | 0.003262 | 0.001785 | 0.000178 | -0.001013 | -0.000715 | -0.002600 | 0.000925 | -0.000180 | 0.002646 | 0.001349 | -0.002777 | -0.001295 | 0.002696 | -0.000053 | 0.000822 | -0.000342 | 0.000601 | 0.001392 | -0.001242 | -0.000616 | 0.001206 | 0.000465 | -0.000529 | 0.001292 | 0.000946 | 0.004905 | -0.017813 | 0.011723 | -0.000740 | -0.001329 | 0.006708 | 0.004526 | -0.007591 | 0.006828 | -0.003260 | -0.003782 | 0.001007 | -0.000683 |
| PC2 | 0.034786 | 0.066986 | 0.244917 | 0.078778 | 0.029204 | 0.034655 | 0.034655 | 0.723285 | 0.033748 | 0.631862 | -0.012991 | 0.003526 | -0.002088 | 0.002593 | 0.004196 | 0.001376 | -0.002430 | -0.001617 | 0.000311 | -0.002079 | -0.002556 | -1.497902e-03 | 0.001748 | -0.001954 | 0.001161 | 0.005262 | -0.000081 | -0.002615 | -0.000464 | 0.000770 | -0.004703 | 0.003479 | -0.000551 | 0.000022 | 0.001183 | 0.001584 | 0.003830 | 0.000214 | -0.001473 | 0.000503 | 0.001624 | -0.001012 | -0.000109 | -0.000974 | 0.000889 | -0.001269 | 0.002318 | -0.002855 | -0.000489 | 0.001855 | 0.000318 | 0.003981 | -0.000811 | 0.000665 | -0.005265 | 0.000626 | -0.002587 | -0.003598 | -0.000475 | -0.003057 | -0.000618 | 0.001879 | 0.004996 | -0.005647 | 0.002246 | 0.000602 | -0.006716 | -0.005943 | -0.001409 | 0.009998 | 0.011823 | -0.007954 | -0.011130 | 0.014360 | -0.010695 |
| PC3 | 0.020987 | -0.703759 | 0.448649 | -0.222678 | 0.016696 | 0.026973 | 0.026973 | -0.133667 | 0.479752 | 0.051235 | -0.001096 | -0.002532 | 0.005686 | -0.003608 | -0.002410 | -0.001898 | -0.000798 | 0.001602 | 0.002197 | 0.001202 | -0.001637 | 1.838183e-03 | 0.002172 | -0.001075 | 0.002315 | 0.003896 | -0.000816 | 0.002802 | 0.003015 | 0.002425 | 0.000154 | 0.002252 | 0.002296 | -0.003900 | 0.000870 | 0.001835 | -0.000836 | -0.000485 | 0.000729 | -0.002419 | -0.000222 | 0.000651 | -0.003003 | 0.001381 | 0.000142 | -0.002330 | 0.000148 | 0.002994 | 0.001678 | -0.003419 | 0.000261 | 0.000085 | -0.000322 | -0.000399 | -0.002631 | -0.003317 | 0.000307 | -0.000760 | -0.003810 | -0.001117 | 0.000752 | 0.000598 | 0.011886 | 0.002207 | 0.001855 | -0.006924 | -0.006949 | 0.012804 | 0.012613 | -0.011827 | 0.000722 | -0.001366 | 0.008247 | 0.001653 | 0.000463 |
| PC4 | -0.004048 | 0.256960 | -0.269824 | 0.293188 | 0.007957 | -0.001998 | -0.001998 | 0.176073 | 0.836674 | -0.206437 | -0.005821 | 0.007487 | -0.001019 | 0.004285 | -0.000872 | 0.000411 | 0.000086 | -0.002568 | -0.004665 | -0.002164 | 0.000208 | 2.823227e-03 | 0.004771 | -0.002432 | 0.001975 | 0.000322 | 0.002404 | 0.000068 | 0.001032 | 0.002879 | -0.000708 | 0.000289 | 0.000582 | -0.000929 | 0.001188 | -0.000843 | -0.001044 | -0.001523 | -0.001606 | 0.000730 | -0.003598 | -0.002457 | 0.001679 | 0.001629 | 0.002606 | -0.003018 | -0.000553 | 0.000720 | 0.002025 | 0.002832 | 0.000006 | -0.005462 | -0.001565 | 0.003348 | 0.001566 | -0.001039 | -0.002969 | 0.001696 | -0.000401 | -0.000828 | -0.003472 | 0.001318 | 0.004281 | -0.013926 | 0.016940 | -0.015342 | 0.000996 | -0.007885 | 0.007698 | 0.011759 | 0.004831 | -0.000302 | -0.005028 | 0.005020 | -0.008570 |
| PC5 | -0.017164 | 0.595778 | 0.266676 | -0.593259 | -0.011780 | -0.016075 | -0.016075 | -0.311209 | 0.239294 | 0.255107 | 0.017134 | -0.002954 | -0.006564 | 0.002384 | 0.003909 | -0.000229 | -0.002891 | -0.001203 | -0.001185 | -0.004342 | -0.003952 | 2.031499e-03 | -0.001117 | -0.000523 | 0.003698 | 0.001617 | -0.002751 | -0.002093 | -0.002875 | 0.001077 | -0.003163 | -0.003071 | 0.003566 | -0.002295 | -0.002742 | 0.001738 | -0.002839 | -0.000606 | 0.000188 | 0.002156 | 0.001008 | -0.001364 | 0.000065 | 0.001905 | 0.000680 | 0.002103 | 0.000672 | 0.003052 | -0.001617 | 0.002321 | 0.002681 | -0.000465 | 0.001591 | 0.001113 | -0.001354 | 0.000549 | -0.001817 | 0.002698 | 0.000332 | 0.005580 | -0.003225 | 0.000477 | -0.011590 | -0.011469 | 0.004334 | 0.000086 | 0.015385 | -0.000527 | -0.005712 | 0.004232 | 0.005062 | 0.012140 | -0.001099 | -0.020636 | 0.007057 |
| PC6 | -0.027434 | -0.047293 | -0.142876 | 0.489723 | -0.024813 | -0.033703 | -0.033703 | -0.560414 | 0.072855 | 0.642076 | -0.016513 | 0.010724 | 0.003905 | -0.002508 | -0.002649 | -0.001694 | 0.000833 | 0.000858 | 0.003702 | 0.000500 | -0.001766 | -1.110743e-03 | -0.004005 | 0.000556 | -0.004084 | 0.003338 | -0.003876 | 0.001888 | -0.002273 | 0.007009 | 0.001973 | 0.003215 | 0.001912 | -0.002106 | 0.001514 | -0.002346 | -0.000751 | 0.000367 | -0.001554 | -0.001914 | -0.003344 | 0.002292 | -0.000405 | 0.000839 | 0.003200 | 0.000362 | 0.002935 | -0.000054 | 0.000026 | -0.002561 | 0.002722 | 0.001883 | -0.000498 | 0.000272 | 0.004749 | -0.003082 | -0.001426 | -0.001298 | 0.001503 | -0.002838 | 0.005302 | -0.004470 | 0.022373 | -0.005892 | -0.019436 | 0.003587 | 0.010637 | -0.011743 | -0.004022 | 0.005135 | 0.009192 | 0.000280 | -0.005209 | 0.009610 | -0.009111 |
| PC7 | 0.399711 | 0.047365 | -0.029348 | 0.022300 | 0.418517 | 0.571233 | 0.571233 | -0.086289 | -0.012914 | -0.001332 | -0.013442 | 0.003375 | 0.003766 | -0.001322 | -0.003724 | 0.000970 | 0.000647 | -0.000647 | -0.004352 | 0.001987 | 0.001349 | -1.440437e-03 | 0.002289 | -0.002165 | -0.002116 | 0.005620 | 0.003653 | -0.008036 | -0.000434 | 0.000123 | -0.002901 | -0.000486 | 0.003687 | -0.001717 | 0.000648 | 0.001213 | 0.003718 | -0.003606 | -0.000028 | 0.002537 | -0.000270 | -0.001264 | 0.001677 | -0.000431 | 0.001296 | 0.002117 | -0.000054 | 0.002707 | 0.002595 | 0.000167 | -0.000756 | 0.003591 | -0.001868 | -0.003654 | -0.003381 | 0.000401 | -0.001285 | -0.000780 | 0.000674 | 0.004115 | 0.000630 | -0.000242 | 0.006349 | 0.008873 | -0.004727 | 0.009857 | -0.000295 | -0.001645 | -0.008414 | 0.002933 | 0.005707 | -0.013468 | 0.010229 | -0.001274 | 0.009030 |
| PC8 | -0.027806 | 0.009826 | 0.005920 | -0.022956 | 0.003204 | -0.004218 | -0.004218 | -0.002359 | -0.000181 | -0.013181 | -0.884099 | 0.389617 | 0.133505 | -0.002917 | -0.002770 | -0.002166 | -0.005780 | 0.001250 | 0.001327 | -0.002394 | -0.000084 | -4.736651e-03 | 0.000007 | 0.000654 | -0.005039 | -0.003247 | 0.007957 | 0.004232 | 0.000007 | -0.008908 | 0.000919 | 0.005440 | -0.000307 | 0.004319 | -0.002171 | 0.000809 | -0.001312 | -0.001993 | 0.009048 | 0.001290 | 0.000548 | 0.004944 | 0.007173 | 0.002534 | -0.000772 | 0.004254 | 0.002700 | -0.005715 | -0.004080 | 0.000005 | -0.008192 | -0.002059 | 0.003531 | -0.000580 | 0.006521 | -0.000567 | -0.005068 | 0.000522 | -0.002365 | 0.006576 | -0.002528 | -0.000806 | -0.045546 | 0.155341 | -0.113487 | 0.020673 | -0.050617 | -0.007931 | 0.032682 | 0.013551 | -0.015404 | 0.042324 | -0.019049 | -0.017662 | 0.014183 |
| PC9 | -0.010425 | 0.010110 | -0.000245 | -0.019258 | 0.001597 | -0.001306 | -0.001306 | 0.013277 | -0.001874 | -0.020809 | -0.009454 | 0.020512 | -0.000342 | 0.009418 | 0.001159 | 0.005137 | -0.000645 | -0.000309 | -0.000061 | 0.000835 | -0.004419 | -1.052438e-03 | -0.001465 | 0.005080 | 0.003012 | 0.001027 | -0.004220 | 0.004942 | 0.003640 | -0.000400 | -0.000367 | 0.005168 | -0.003569 | 0.000952 | 0.001468 | -0.007858 | 0.005394 | -0.001556 | 0.003465 | 0.007275 | 0.001114 | 0.007756 | -0.000941 | -0.007727 | 0.007730 | -0.003685 | -0.011413 | -0.006061 | 0.000607 | -0.001994 | -0.002856 | 0.010892 | -0.000431 | 0.003511 | -0.004520 | -0.005251 | -0.012228 | -0.006306 | 0.004845 | 0.002260 | -0.010893 | -0.000126 | 0.800585 | -0.261524 | -0.529683 | 0.014968 | 0.010942 | -0.014100 | -0.026689 | -0.013324 | 0.034792 | -0.014760 | -0.048814 | -0.026839 | 0.033276 |
| PC10 | -0.006300 | 0.014876 | 0.015240 | 0.014706 | 0.004760 | -0.003319 | -0.003319 | 0.006177 | 0.020189 | -0.003307 | 0.186427 | -0.068474 | -0.039996 | -0.004672 | 0.005370 | 0.002663 | 0.002324 | -0.003975 | -0.007458 | 0.000306 | -0.000297 | -1.875513e-03 | -0.000085 | 0.007839 | 0.002689 | -0.002112 | -0.002041 | 0.002270 | -0.011730 | -0.005629 | -0.010246 | 0.004197 | 0.002380 | 0.006143 | -0.001515 | -0.001904 | 0.003162 | 0.002813 | 0.000484 | 0.008768 | -0.008217 | 0.005162 | 0.000594 | -0.001763 | -0.000764 | 0.007463 | 0.000966 | -0.008426 | -0.003672 | 0.011269 | -0.005611 | 0.002801 | 0.004219 | 0.000791 | 0.001146 | 0.001923 | -0.007417 | -0.000299 | 0.003729 | -0.005017 | 0.006933 | -0.008295 | -0.149903 | 0.750350 | -0.604075 | -0.006980 | -0.035272 | 0.012402 | 0.011151 | 0.004673 | -0.028611 | 0.037408 | -0.039707 | 0.022525 | 0.012285 |
| PC11 | -0.006062 | -0.009862 | 0.000992 | 0.005052 | 0.046015 | -0.011715 | -0.011715 | 0.012533 | 0.003656 | -0.001605 | -0.051346 | -0.032451 | 0.001352 | -0.002081 | -0.001232 | -0.006137 | -0.005673 | -0.002184 | -0.003790 | -0.006495 | -0.001923 | 1.832567e-03 | -0.001925 | -0.005222 | 0.012827 | 0.001451 | 0.008295 | -0.000054 | 0.006000 | -0.008530 | -0.003962 | -0.003524 | -0.002516 | 0.002578 | -0.003849 | 0.005963 | -0.001182 | 0.008184 | -0.005455 | -0.002219 | 0.009555 | 0.000017 | 0.006021 | -0.002855 | 0.003002 | 0.006045 | 0.002921 | 0.000927 | 0.001665 | -0.002857 | 0.004353 | -0.002533 | -0.007636 | 0.008353 | 0.000071 | 0.003923 | -0.001181 | -0.002326 | 0.000965 | -0.001096 | -0.005214 | 0.001727 | -0.034961 | 0.019820 | 0.009152 | -0.084917 | 0.754349 | -0.084601 | -0.186416 | -0.386757 | -0.101220 | 0.066818 | -0.318126 | 0.068529 | 0.320012 |
| PC12 | -0.079024 | -0.000526 | -0.001533 | 0.010544 | 0.034910 | 0.028828 | 0.028828 | 0.007441 | 0.002772 | 0.006891 | 0.067119 | 0.031215 | -0.002103 | 0.002261 | -0.001515 | 0.001050 | -0.000201 | 0.006943 | -0.014382 | 0.007667 | 0.004560 | 3.745966e-03 | -0.001668 | 0.004548 | -0.002068 | -0.001227 | -0.006847 | 0.003079 | -0.005051 | -0.001596 | 0.008685 | 0.006395 | -0.001157 | 0.004812 | 0.000934 | 0.000618 | -0.004191 | 0.015519 | -0.006502 | 0.003421 | -0.001870 | -0.000394 | 0.002301 | 0.006781 | 0.003573 | 0.002233 | 0.006267 | -0.007244 | -0.001838 | 0.001862 | -0.007243 | -0.002823 | -0.009373 | -0.004484 | 0.007350 | -0.001923 | -0.004409 | 0.000407 | -0.002034 | 0.004670 | 0.001000 | -0.009429 | 0.021850 | -0.043119 | 0.032448 | 0.293610 | -0.170758 | 0.015619 | 0.151763 | -0.280356 | -0.295806 | 0.774306 | -0.175461 | -0.150927 | -0.149873 |
| PC13 | 0.090728 | 0.013527 | -0.003424 | 0.008549 | -0.071184 | -0.008857 | -0.008857 | 0.006555 | 0.002498 | 0.016873 | 0.005682 | 0.000546 | -0.023230 | -0.011840 | -0.001079 | 0.001532 | -0.000578 | 0.008094 | 0.006433 | -0.003019 | -0.014511 | -3.822618e-03 | 0.004913 | 0.008159 | -0.001993 | 0.007850 | -0.001017 | 0.004433 | -0.006846 | 0.002643 | -0.010753 | -0.007510 | 0.016015 | 0.001207 | -0.000403 | 0.004072 | 0.001986 | 0.007934 | -0.000043 | -0.002960 | 0.001937 | 0.000761 | -0.000283 | -0.005896 | -0.001023 | -0.013730 | 0.008809 | 0.004363 | 0.002151 | -0.001469 | -0.003635 | -0.000386 | 0.002306 | 0.001969 | -0.002427 | -0.005214 | -0.000987 | -0.002238 | -0.000841 | 0.004582 | -0.002602 | -0.000465 | 0.029848 | 0.004908 | -0.016857 | -0.023301 | 0.002105 | -0.199378 | 0.503855 | -0.279720 | -0.464259 | -0.205805 | 0.486011 | -0.130155 | 0.314738 |
| PC14 | 0.028497 | 0.010741 | -0.006902 | -0.003569 | -0.006558 | -0.016625 | -0.016625 | -0.006692 | 0.015701 | 0.002694 | 0.006622 | -0.010869 | 0.013855 | 0.002550 | 0.004483 | -0.005333 | 0.000181 | 0.015325 | -0.000212 | -0.007114 | -0.005070 | -6.874941e-03 | 0.003182 | 0.006476 | 0.013803 | 0.002431 | 0.001864 | -0.002832 | -0.002387 | -0.004965 | -0.005006 | 0.010677 | 0.005262 | 0.000289 | 0.008052 | -0.006249 | -0.003762 | 0.001207 | 0.005857 | -0.002166 | 0.001784 | 0.005482 | 0.005244 | -0.001926 | 0.005074 | -0.003090 | 0.006587 | 0.007446 | -0.009498 | -0.009348 | 0.004786 | -0.007210 | -0.000125 | -0.004610 | 0.000602 | -0.017112 | -0.003283 | -0.000089 | -0.010963 | 0.001581 | 0.004259 | -0.002548 | -0.013371 | -0.004364 | 0.034614 | 0.562171 | -0.365987 | 0.201434 | -0.138193 | -0.273115 | -0.075875 | -0.329059 | -0.273321 | 0.301615 | 0.361186 |
| PC15 | -0.042415 | 0.009695 | 0.009247 | -0.019924 | -0.018540 | 0.026601 | 0.026601 | -0.010191 | -0.003230 | -0.008970 | -0.012427 | 0.032021 | -0.022661 | 0.009474 | -0.004055 | -0.002454 | 0.001083 | 0.001397 | -0.001067 | -0.005916 | 0.002731 | -8.742174e-03 | -0.005737 | -0.004572 | -0.009773 | -0.010208 | -0.005050 | 0.000323 | -0.005258 | 0.002046 | -0.001808 | 0.008147 | 0.008260 | 0.008044 | 0.000150 | -0.001482 | 0.006927 | 0.005464 | -0.001454 | 0.007997 | -0.006481 | 0.000006 | -0.001747 | 0.004409 | 0.000352 | -0.005942 | -0.001712 | -0.006581 | 0.005967 | 0.003561 | 0.002384 | 0.004037 | -0.000319 | 0.002678 | -0.003723 | 0.011465 | 0.013514 | 0.004421 | -0.002585 | -0.005509 | 0.005260 | -0.004496 | 0.035976 | -0.029823 | -0.022503 | -0.047944 | 0.092237 | -0.136349 | 0.018752 | 0.062110 | -0.402846 | -0.016435 | 0.047440 | 0.788239 | -0.412315 |
| PC16 | -0.028005 | 0.015416 | -0.014340 | 0.000761 | 0.041634 | 0.002304 | 0.002304 | -0.003557 | -0.008104 | 0.013843 | -0.012596 | -0.000504 | 0.011318 | 0.000624 | 0.002079 | 0.006800 | -0.003997 | 0.003862 | -0.001429 | -0.012331 | 0.000031 | -7.633282e-03 | -0.000727 | 0.007957 | 0.004539 | 0.003647 | 0.002634 | -0.004524 | 0.007527 | 0.004808 | 0.006259 | 0.000633 | -0.007946 | -0.004462 | 0.002414 | 0.002693 | 0.006723 | 0.013453 | -0.001927 | 0.000840 | -0.001710 | 0.001065 | -0.014438 | -0.009177 | 0.006763 | 0.006524 | -0.007129 | 0.001776 | -0.006301 | -0.012899 | 0.001663 | 0.009986 | -0.000592 | 0.004991 | 0.001943 | 0.001776 | -0.001858 | -0.001047 | -0.000572 | -0.005743 | -0.001082 | -0.001532 | 0.005539 | -0.036273 | -0.011967 | -0.527105 | -0.120884 | 0.727808 | 0.225718 | -0.313868 | -0.000957 | -0.006884 | -0.086267 | 0.116435 | -0.025086 |
| PC17 | -0.077449 | -0.007079 | 0.008711 | 0.008918 | 0.027221 | 0.019405 | 0.019405 | -0.001815 | -0.001375 | 0.005997 | 0.026930 | 0.015061 | -0.012300 | -0.000377 | -0.000937 | -0.009445 | -0.004044 | 0.010413 | 0.005325 | 0.004617 | -0.000834 | 1.519758e-02 | -0.002244 | -0.001406 | 0.002724 | 0.005933 | 0.002485 | -0.007935 | -0.007310 | 0.001742 | 0.000644 | -0.010971 | -0.006248 | -0.001507 | -0.003549 | 0.005896 | -0.004692 | 0.003088 | 0.006780 | -0.000357 | -0.003698 | -0.005874 | -0.008401 | -0.000729 | 0.008574 | -0.006765 | 0.006180 | -0.003847 | -0.006080 | -0.001226 | 0.000795 | -0.003717 | -0.002181 | 0.002776 | 0.007309 | 0.003975 | 0.002056 | -0.006049 | 0.005621 | -0.002516 | 0.002435 | 0.009935 | 0.026103 | -0.020576 | 0.044328 | -0.273005 | -0.150014 | 0.068736 | -0.191258 | 0.552058 | -0.488344 | 0.158278 | -0.174699 | -0.006099 | 0.499323 |
| PC18 | -0.022087 | 0.003095 | 0.001037 | 0.013105 | -0.008564 | 0.003105 | 0.003105 | 0.011532 | 0.012669 | 0.005433 | -0.043711 | -0.045058 | 0.033259 | 0.000890 | 0.003935 | 0.004205 | 0.007501 | 0.008606 | 0.006395 | 0.002136 | -0.001402 | -9.137704e-03 | -0.006138 | -0.000860 | -0.002247 | -0.010597 | -0.003922 | -0.006237 | -0.001999 | 0.007412 | -0.010732 | 0.016349 | 0.005662 | -0.007673 | 0.002473 | 0.000145 | -0.010415 | 0.005254 | -0.010354 | 0.014162 | 0.001917 | 0.001884 | 0.012674 | 0.005274 | 0.013463 | 0.001408 | -0.003909 | -0.006233 | -0.003306 | 0.000265 | 0.003559 | 0.000261 | -0.009818 | 0.003925 | -0.001245 | 0.005539 | -0.004848 | -0.018853 | 0.001284 | -0.001701 | 0.000614 | -0.001992 | -0.000735 | 0.007559 | -0.044203 | 0.174571 | 0.133671 | 0.386522 | -0.607210 | -0.078721 | -0.268838 | 0.015411 | 0.548330 | -0.158879 | -0.135339 |
| PC19 | 0.854801 | 0.002690 | -0.013152 | -0.006512 | -0.425251 | -0.142148 | -0.142148 | -0.006538 | 0.005472 | -0.000074 | -0.044903 | -0.073520 | 0.057635 | 0.003429 | -0.003167 | -0.000524 | -0.000942 | -0.003383 | 0.004583 | -0.001133 | 0.004452 | -9.180292e-03 | 0.008530 | -0.001448 | 0.003609 | 0.008451 | 0.003227 | -0.004216 | -0.013138 | -0.002346 | 0.003492 | 0.002790 | 0.003969 | 0.004253 | -0.004581 | -0.000185 | 0.006804 | -0.008276 | -0.004744 | -0.009823 | 0.005131 | -0.001837 | 0.011710 | 0.012091 | 0.004279 | 0.005835 | -0.010664 | -0.007739 | 0.001831 | -0.008960 | -0.007772 | -0.004172 | 0.004550 | 0.006425 | -0.005522 | -0.009358 | 0.012678 | -0.000016 | -0.004060 | -0.003345 | 0.002103 | 0.004375 | 0.016179 | 0.005851 | -0.000795 | -0.035727 | -0.004981 | 0.051709 | -0.051910 | 0.044888 | -0.042357 | 0.124234 | -0.084954 | 0.021674 | -0.034690 |
| PC20 | -0.101961 | 0.009273 | 0.003267 | 0.003015 | 0.015646 | 0.028116 | 0.028116 | -0.001221 | 0.002055 | 0.000026 | -0.261018 | -0.797604 | 0.517640 | 0.001533 | 0.008605 | 0.002680 | 0.001302 | 0.000494 | 0.004583 | -0.009283 | 0.009523 | -1.439095e-02 | 0.004943 | 0.009771 | 0.002059 | 0.003909 | 0.001537 | -0.005213 | 0.023545 | -0.008397 | -0.009994 | -0.005305 | -0.008607 | -0.002721 | -0.014824 | 0.008347 | -0.001922 | 0.010246 | -0.002418 | 0.001086 | 0.010327 | 0.004535 | 0.002024 | 0.004635 | -0.005893 | 0.014737 | -0.015802 | 0.006746 | -0.006368 | -0.001116 | -0.009168 | -0.001729 | 0.009814 | -0.000839 | -0.001335 | -0.002979 | -0.009917 | -0.007554 | -0.000115 | 0.010212 | -0.007326 | -0.000804 | -0.014658 | -0.022402 | -0.040422 | 0.001279 | -0.037896 | -0.039639 | 0.066804 | 0.028394 | -0.021353 | 0.022064 | -0.019309 | 0.015544 | -0.008841 |
| PC21 | -0.262205 | 0.003299 | -0.002180 | 0.002971 | -0.782546 | 0.377870 | 0.377870 | 0.006275 | 0.003826 | 0.001078 | -0.003902 | 0.005328 | -0.043064 | 0.001231 | -0.009137 | 0.015327 | -0.019939 | -0.004046 | -0.007628 | -0.015614 | -0.006825 | -5.102897e-03 | -0.000121 | 0.019736 | 0.006350 | 0.031704 | 0.009032 | 0.001932 | -0.008187 | -0.012083 | -0.010537 | 0.016169 | -0.007460 | 0.003440 | -0.004378 | 0.014251 | -0.011256 | -0.006156 | -0.005292 | -0.008761 | 0.007866 | 0.004966 | -0.003707 | -0.002420 | 0.000901 | 0.000472 | 0.005529 | -0.004571 | 0.024511 | 0.004821 | -0.006787 | -0.011525 | -0.010198 | 0.010461 | 0.010704 | 0.007266 | -0.012968 | -0.023458 | 0.001637 | -0.008795 | 0.037236 | -0.007688 | 0.070212 | 0.081427 | 0.076955 | -0.012416 | 0.021177 | 0.021832 | -0.014968 | -0.032089 | 0.038839 | -0.015165 | -0.039727 | -0.021475 | 0.019195 |
| PC22 | -0.016855 | -0.012007 | -0.000410 | 0.004821 | -0.105423 | 0.050508 | 0.050508 | 0.004367 | 0.006290 | -0.003398 | -0.000907 | 0.014322 | -0.071534 | -0.010611 | 0.001130 | -0.001633 | 0.009650 | -0.023011 | -0.000620 | 0.011066 | 0.007656 | -2.312926e-03 | 0.013870 | 0.008534 | -0.002420 | -0.005057 | 0.001241 | -0.004999 | -0.013181 | -0.007888 | -0.012417 | -0.003365 | -0.021443 | 0.013060 | 0.012980 | 0.002544 | 0.016374 | -0.020586 | 0.003277 | -0.020964 | 0.004730 | -0.008796 | 0.001370 | 0.011314 | -0.007781 | -0.004980 | 0.017037 | -0.004585 | -0.003361 | -0.011686 | -0.020037 | 0.024446 | 0.015494 | 0.004061 | 0.025908 | 0.020076 | 0.000848 | -0.025408 | -0.021901 | -0.000054 | 0.019409 | 0.007928 | -0.564044 | -0.572865 | -0.566901 | 0.007185 | -0.010630 | -0.017613 | 0.000675 | 0.012262 | -0.003910 | 0.005407 | -0.016631 | -0.007990 | 0.031039 |
| PC23 | -0.008520 | 0.004318 | 0.004586 | 0.001964 | -0.033199 | 0.014954 | 0.014954 | 0.005090 | -0.001379 | -0.000276 | 0.317088 | 0.438434 | 0.831360 | -0.016426 | 0.004795 | -0.005443 | -0.014477 | -0.000494 | 0.015571 | -0.019763 | 0.008864 | -1.632657e-02 | 0.012474 | 0.009443 | 0.006171 | 0.002117 | 0.015058 | 0.011634 | -0.014702 | -0.024617 | 0.014490 | 0.006515 | 0.013121 | 0.021396 | -0.003702 | -0.017563 | -0.008284 | 0.015346 | -0.033751 | -0.022411 | -0.010908 | -0.009473 | 0.003283 | 0.002997 | -0.006048 | 0.031706 | -0.000127 | 0.024670 | 0.015172 | -0.007831 | -0.005481 | 0.005735 | 0.001103 | -0.008570 | -0.003897 | 0.005559 | 0.018778 | -0.008854 | -0.004471 | -0.005361 | -0.007111 | 0.001647 | -0.033216 | -0.021888 | -0.029776 | -0.007939 | 0.038732 | -0.006313 | 0.000252 | 0.000437 | 0.005302 | -0.017919 | 0.018408 | 0.014052 | 0.020018 |
| PC24 | 0.006451 | 0.003365 | 0.000083 | -0.002448 | -0.003433 | -0.001784 | -0.001784 | -0.000639 | 0.000341 | 0.001720 | -0.003155 | -0.002689 | 0.007739 | -0.001787 | 0.007017 | -0.006495 | 0.082235 | -0.000824 | -0.024379 | -0.067833 | -0.014076 | -2.311306e-02 | 0.026425 | 0.063958 | -0.024772 | -0.023470 | 0.015903 | 0.000850 | 0.035774 | 0.033493 | 0.025259 | -0.116584 | -0.016398 | 0.030953 | -0.035579 | -0.002059 | -0.015698 | 0.024706 | 0.062316 | 0.059543 | -0.045291 | -0.001730 | 0.009432 | 0.005749 | -0.064360 | 0.036380 | 0.070387 | 0.091853 | -0.100579 | -0.030861 | 0.060597 | -0.000030 | -0.020177 | -0.077418 | 0.070717 | -0.038487 | -0.064974 | -0.034319 | -0.066400 | 0.004743 | 0.063044 | 0.012828 | 0.001931 | 0.001054 | 0.010165 | -0.384583 | -0.382215 | -0.385608 | -0.387395 | -0.379301 | 0.172217 | 0.181595 | 0.179298 | 0.170139 | 0.174187 |
| PC25 | 0.002234 | -0.001958 | 0.001145 | -0.002491 | 0.014817 | -0.001807 | -0.001807 | -0.004336 | 0.001555 | 0.000632 | 0.001854 | 0.008666 | 0.012453 | 0.007791 | -0.006136 | -0.022756 | -0.010360 | -0.006244 | 0.041240 | -0.030458 | -0.016504 | -4.652408e-03 | -0.025729 | 0.017340 | -0.070997 | -0.009635 | -0.040116 | -0.012435 | -0.041637 | -0.017185 | -0.038899 | 0.023483 | 0.025408 | 0.015866 | -0.000181 | 0.053026 | -0.099145 | 0.155009 | -0.020798 | 0.043593 | -0.015642 | -0.001579 | -0.015365 | 0.002645 | -0.014319 | 0.035895 | 0.123998 | -0.050772 | 0.051818 | 0.010430 | -0.003273 | 0.011480 | 0.035761 | 0.029058 | -0.049789 | -0.060985 | 0.042294 | -0.065889 | -0.018651 | 0.038505 | 0.027008 | -0.038993 | -0.013645 | -0.002120 | 0.003602 | -0.180386 | -0.160065 | -0.180970 | -0.176852 | -0.179386 | -0.383103 | -0.384295 | -0.389483 | -0.390388 | -0.386574 |
| PC26 | 0.008657 | -0.000624 | -0.002404 | -0.000382 | -0.007482 | 0.001632 | 0.001632 | -0.000405 | 0.000861 | -0.001135 | -0.004283 | -0.003821 | -0.020809 | -0.006781 | -0.005597 | -0.001180 | -0.411208 | -0.016033 | -0.015576 | 0.013979 | 0.001958 | -2.028126e-03 | 0.006095 | -0.142152 | -0.033737 | -0.008406 | 0.003467 | -0.003497 | -0.002381 | 0.001973 | -0.002458 | -0.028444 | -0.013186 | -0.010624 | -0.003813 | -0.029187 | 0.023900 | 0.884083 | 0.011902 | -0.038153 | -0.002538 | -0.004212 | 0.003972 | -0.009136 | -0.010422 | -0.032064 | -0.034848 | 0.007231 | -0.016831 | -0.008251 | -0.002649 | -0.007572 | -0.006498 | -0.018100 | 0.007142 | 0.006061 | -0.027843 | 0.005679 | -0.006368 | -0.016697 | -0.004350 | 0.007266 | -0.008737 | -0.011307 | -0.013021 | 0.023192 | 0.010886 | 0.011367 | 0.011314 | 0.028266 | 0.067190 | 0.046162 | 0.058036 | 0.052740 | 0.052994 |
| PC27 | -0.003205 | 0.000635 | -0.000213 | -0.003176 | -0.019605 | 0.008402 | 0.008402 | 0.000844 | 0.000945 | 0.001530 | -0.002290 | 0.007715 | 0.002913 | -0.013072 | -0.009864 | -0.013285 | 0.873222 | -0.016831 | -0.017086 | -0.042639 | -0.008913 | -1.265520e-02 | -0.005852 | -0.285991 | -0.063520 | 0.001541 | -0.004764 | -0.006192 | -0.025249 | -0.045843 | -0.019530 | -0.014962 | -0.014303 | -0.015855 | -0.046695 | -0.022680 | -0.012154 | 0.343545 | -0.045674 | -0.077941 | -0.001938 | -0.007857 | -0.023019 | -0.051078 | -0.009768 | -0.033844 | -0.031966 | -0.018492 | -0.000715 | -0.012446 | -0.015089 | -0.018816 | -0.009688 | -0.006710 | -0.020475 | -0.003691 | -0.037831 | -0.011805 | -0.005575 | -0.023538 | -0.018839 | -0.008363 | 0.003056 | 0.001132 | -0.000532 | 0.029697 | 0.030046 | 0.031483 | 0.034244 | 0.033190 | 0.010526 | 0.002676 | -0.000925 | 0.005285 | 0.011222 |
| PC28 | 0.007155 | -0.000168 | -0.000253 | -0.002929 | 0.007854 | -0.003661 | -0.003661 | 0.002401 | 0.003733 | 0.000732 | -0.001820 | 0.003288 | -0.012506 | -0.005858 | -0.009078 | -0.023269 | 0.149973 | -0.018143 | -0.019500 | -0.013721 | -0.003288 | 4.729367e-03 | -0.004868 | 0.867863 | -0.394548 | -0.013937 | -0.004523 | -0.010231 | -0.014306 | -0.023246 | -0.015841 | -0.024214 | -0.010580 | -0.025019 | -0.037611 | -0.027824 | -0.002275 | 0.175230 | -0.062129 | -0.083493 | 0.000480 | -0.013203 | -0.012409 | -0.041865 | -0.001887 | -0.053080 | -0.042460 | -0.016344 | -0.002032 | -0.008448 | -0.007467 | -0.023132 | -0.013908 | -0.004740 | -0.032890 | 0.002349 | -0.014851 | 0.007062 | -0.002156 | -0.027115 | -0.037354 | 0.001385 | 0.001335 | -0.000371 | 0.007312 | 0.040871 | 0.046520 | 0.036815 | 0.030649 | 0.045247 | 0.016573 | 0.008922 | 0.005372 | 0.014136 | 0.012901 |
| PC29 | 0.002743 | -0.001796 | -0.002971 | 0.001995 | 0.010202 | -0.001417 | -0.001417 | -0.001759 | -0.001863 | 0.001877 | -0.008759 | 0.003605 | -0.013273 | -0.018968 | -0.021281 | -0.034886 | 0.083708 | -0.027904 | -0.017330 | -0.031349 | -0.007649 | -2.856305e-02 | -0.018433 | 0.327150 | 0.895219 | -0.048847 | -0.023210 | -0.012529 | -0.040633 | -0.026113 | -0.021170 | -0.061590 | -0.013072 | -0.020439 | -0.111426 | -0.037522 | -0.036834 | 0.118521 | -0.097981 | -0.080278 | -0.023672 | -0.014580 | -0.033366 | -0.076232 | -0.032489 | -0.064305 | -0.015041 | -0.037578 | -0.009539 | -0.015229 | -0.012423 | -0.017123 | -0.012329 | -0.030051 | -0.029490 | -0.012526 | -0.030228 | -0.025178 | -0.016241 | -0.034320 | -0.016918 | -0.022508 | -0.003064 | -0.003441 | 0.003115 | -0.004985 | -0.003802 | -0.011048 | -0.003408 | 0.007743 | -0.017380 | -0.014288 | -0.008514 | -0.015666 | -0.032711 |
| PC30 | 0.002280 | 0.001735 | 0.003165 | -0.000195 | -0.002247 | -0.000191 | -0.000191 | -0.001590 | -0.001103 | -0.001104 | -0.007140 | -0.009091 | 0.003129 | -0.003443 | -0.003037 | -0.018507 | 0.021226 | -0.011801 | -0.016662 | -0.112039 | -0.007676 | -2.372769e-02 | -0.008587 | 0.024430 | 0.048795 | -0.014579 | -0.003739 | -0.004594 | -0.009573 | -0.066978 | -0.011060 | -0.174027 | -0.006430 | -0.005619 | 0.954444 | 0.002705 | -0.036007 | 0.010774 | -0.127772 | -0.074740 | -0.006539 | -0.004723 | -0.017569 | -0.078983 | -0.020825 | 0.004772 | -0.017287 | -0.013984 | -0.013657 | -0.000203 | -0.004614 | -0.016043 | -0.005084 | -0.013403 | -0.010134 | -0.006481 | -0.060826 | -0.008073 | -0.003912 | -0.006611 | -0.009475 | -0.006234 | 0.006077 | 0.008468 | 0.005960 | -0.008616 | -0.002967 | -0.010082 | -0.006863 | 0.000393 | 0.000229 | 0.007021 | 0.003085 | 0.001787 | -0.001070 |
| PC31 | -0.007011 | -0.001349 | -0.000637 | -0.000126 | 0.005138 | 0.000947 | 0.000947 | 0.000982 | -0.003063 | -0.000407 | -0.003157 | 0.001462 | -0.014421 | -0.004660 | -0.002762 | -0.000875 | 0.011906 | 0.002683 | -0.002828 | -0.062750 | -0.008137 | -2.574440e-07 | -0.005276 | 0.000249 | 0.006578 | -0.002538 | -0.001594 | -0.000293 | 0.004551 | -0.025550 | -0.006362 | -0.128624 | -0.008222 | -0.006373 | -0.017656 | -0.016400 | 0.012206 | 0.010479 | -0.390012 | 0.010949 | -0.001709 | 0.004063 | -0.036817 | 0.899527 | -0.003997 | -0.054193 | -0.008451 | -0.007081 | -0.007557 | -0.007115 | -0.000195 | 0.006101 | -0.001674 | -0.006806 | -0.011292 | -0.006423 | -0.101818 | -0.000837 | 0.003150 | -0.014379 | -0.008551 | -0.002655 | 0.010086 | 0.004830 | 0.000610 | -0.001295 | -0.000275 | 0.005133 | 0.007564 | 0.001327 | 0.003213 | -0.008999 | -0.002163 | 0.001047 | 0.007043 |
| PC32 | 0.003830 | -0.002031 | 0.001410 | 0.000066 | -0.010646 | 0.000572 | 0.000572 | -0.001231 | 0.000736 | -0.000235 | -0.007296 | 0.001035 | -0.004493 | -0.004296 | -0.001014 | -0.006082 | 0.000982 | 0.001329 | 0.011137 | 0.135694 | 0.001714 | 1.125818e-02 | 0.009061 | -0.012786 | -0.028005 | 0.027351 | 0.012545 | 0.005720 | 0.011170 | -0.014651 | 0.017786 | -0.074217 | -0.005587 | 0.004438 | -0.046261 | 0.007331 | 0.028782 | 0.022091 | -0.643522 | 0.683687 | 0.008642 | 0.001843 | 0.004986 | -0.283767 | -0.005925 | -0.013779 | 0.003086 | 0.015528 | -0.003603 | -0.010327 | -0.002144 | -0.004123 | 0.009182 | 0.000180 | 0.027431 | -0.000034 | 0.054314 | 0.017773 | -0.008058 | 0.005905 | 0.013320 | 0.013114 | -0.011094 | -0.011070 | -0.004845 | 0.009296 | 0.000909 | 0.003625 | 0.012198 | 0.011344 | 0.020539 | 0.010385 | 0.007310 | 0.016349 | 0.022958 |
| PC33 | 0.004330 | -0.004394 | -0.001905 | 0.002624 | -0.005842 | 0.001173 | 0.001173 | -0.000514 | -0.000260 | 0.001975 | 0.010763 | 0.009043 | 0.011662 | -0.031149 | -0.016903 | -0.033162 | 0.040733 | -0.026522 | -0.007762 | -0.480230 | 0.001455 | -1.303701e-02 | -0.008094 | 0.061706 | 0.041492 | -0.005518 | -0.012810 | -0.005128 | -0.050513 | -0.127166 | -0.011431 | -0.094254 | -0.015911 | -0.027145 | 0.040279 | -0.057086 | -0.019200 | 0.044967 | 0.508161 | 0.627295 | 0.000759 | -0.015455 | -0.045008 | 0.154138 | -0.008114 | -0.122342 | -0.055707 | -0.032993 | 0.020786 | -0.023928 | -0.021814 | -0.021890 | -0.013502 | -0.006497 | -0.037947 | 0.001322 | 0.052733 | -0.017001 | -0.001276 | -0.079870 | -0.030587 | -0.012948 | -0.010545 | -0.011389 | -0.002060 | 0.046943 | 0.048440 | 0.042620 | 0.046551 | 0.045340 | 0.000129 | -0.000450 | -0.005161 | -0.012464 | -0.004246 |
| PC34 | -0.002510 | 0.000711 | -0.001309 | -0.000196 | -0.016082 | 0.005265 | 0.005265 | 0.001389 | 0.002863 | 0.004306 | 0.006468 | 0.011037 | 0.025604 | -0.003965 | -0.004857 | -0.006901 | 0.041610 | -0.007885 | 0.003665 | 0.717160 | -0.006403 | -6.095137e-02 | -0.006785 | 0.016754 | 0.006819 | 0.005695 | -0.009299 | -0.003352 | -0.037003 | -0.131099 | -0.030432 | -0.556080 | 0.004214 | -0.010384 | 0.017444 | -0.031052 | -0.051448 | -0.002112 | 0.305844 | 0.138330 | -0.003505 | -0.003210 | -0.019185 | 0.082142 | 0.008871 | -0.004515 | -0.019604 | -0.020288 | 0.021828 | -0.016086 | -0.006875 | -0.014984 | -0.009210 | 0.014553 | -0.044333 | -0.008921 | -0.111408 | -0.034867 | -0.010055 | -0.063250 | -0.009938 | -0.021414 | 0.004373 | 0.005169 | 0.006453 | 0.012701 | 0.016526 | 0.022001 | 0.009814 | -0.000497 | -0.019422 | -0.024995 | -0.020003 | -0.009584 | -0.025764 |
| PC35 | 0.010772 | 0.002398 | -0.000014 | -0.001951 | 0.004019 | -0.005620 | -0.005620 | -0.002334 | -0.001618 | -0.001312 | 0.007962 | 0.001580 | 0.011897 | -0.021773 | -0.009033 | -0.061462 | 0.021358 | -0.029541 | -0.011027 | 0.298466 | -0.009067 | -1.943133e-02 | -0.002874 | 0.039343 | 0.035034 | -0.035271 | -0.003499 | -0.010988 | -0.023373 | -0.210760 | 0.002270 | 0.676081 | -0.012274 | -0.009639 | 0.127384 | -0.015367 | -0.054447 | 0.018989 | 0.091837 | 0.185859 | -0.015762 | -0.024107 | -0.094392 | 0.079634 | -0.071895 | -0.071768 | -0.008844 | 0.023690 | -0.025677 | -0.029764 | 0.004367 | -0.029788 | -0.004852 | -0.036732 | -0.018086 | -0.009679 | -0.546203 | 0.003067 | -0.022895 | -0.040654 | -0.020172 | 0.004654 | -0.007090 | -0.005357 | 0.011883 | -0.034973 | -0.012672 | -0.025834 | -0.009657 | -0.012520 | 0.005368 | 0.003066 | 0.005123 | 0.009222 | 0.010865 |
| PC36 | -0.008210 | 0.004648 | -0.000660 | 0.000074 | 0.002034 | 0.002873 | 0.002873 | -0.000872 | 0.003299 | 0.003738 | -0.008191 | -0.015483 | -0.009922 | -0.009217 | -0.001273 | -0.013533 | 0.016255 | -0.025659 | -0.069328 | 0.226617 | -0.024784 | -3.988763e-02 | -0.002319 | 0.006802 | 0.019142 | -0.013462 | -0.008401 | -0.009896 | 0.021070 | -0.445448 | -0.037673 | 0.277601 | -0.027558 | -0.026940 | 0.087461 | -0.050894 | -0.066955 | 0.019894 | 0.004645 | -0.023473 | -0.025243 | -0.005185 | -0.079845 | 0.114757 | -0.043369 | -0.166087 | -0.022600 | 0.022994 | -0.071162 | -0.022160 | 0.009379 | -0.020569 | -0.023571 | -0.056080 | 0.013854 | -0.027038 | 0.753106 | -0.045441 | -0.022411 | -0.039656 | -0.017172 | -0.009838 | 0.011622 | 0.006233 | 0.000393 | -0.040484 | -0.034981 | -0.030854 | -0.032027 | -0.037142 | 0.034346 | 0.030938 | 0.034090 | 0.021749 | 0.036568 |
| PC37 | -0.003407 | 0.003765 | 0.002770 | -0.001647 | -0.012369 | 0.006450 | 0.006450 | 0.002628 | -0.003164 | -0.003971 | -0.002851 | 0.002880 | 0.026276 | -0.030010 | -0.006986 | -0.080210 | 0.050377 | -0.056809 | -0.099001 | 0.187524 | -0.019640 | -1.279465e-01 | -0.007787 | 0.018294 | 0.014475 | -0.043434 | 0.007523 | -0.012242 | -0.065984 | 0.818596 | -0.065262 | 0.169674 | -0.041440 | -0.003305 | 0.121416 | -0.063191 | -0.186536 | 0.025851 | 0.073373 | 0.117819 | -0.007056 | -0.016696 | -0.145253 | 0.099820 | -0.104669 | -0.123400 | -0.065689 | -0.031577 | -0.036111 | -0.026561 | -0.024946 | -0.051398 | -0.015167 | -0.053690 | -0.046202 | -0.018333 | 0.221347 | -0.056558 | -0.030256 | -0.091690 | -0.051583 | -0.016295 | -0.002305 | 0.002034 | -0.005357 | -0.000094 | 0.008320 | -0.003102 | 0.006674 | -0.002600 | 0.009921 | 0.003924 | -0.006778 | 0.001981 | 0.014442 |
| PC38 | -0.002514 | -0.001758 | -0.002199 | -0.000352 | -0.000903 | 0.000852 | 0.000852 | 0.001961 | 0.002870 | 0.000325 | -0.007739 | -0.005566 | -0.034063 | -0.016001 | -0.018143 | -0.058596 | 0.017627 | -0.014862 | -0.042350 | 0.000880 | -0.020821 | 2.547455e-02 | -0.003990 | 0.036147 | 0.025299 | -0.037617 | -0.018149 | -0.006695 | -0.053547 | -0.032287 | -0.055172 | 0.064921 | -0.014311 | -0.041176 | 0.005479 | -0.121030 | -0.016354 | 0.038039 | 0.009666 | 0.061845 | -0.006846 | -0.012966 | -0.198714 | 0.059768 | -0.022257 | 0.922690 | -0.047832 | -0.027878 | -0.014629 | -0.024878 | -0.008350 | -0.047620 | -0.020993 | -0.025375 | -0.039428 | -0.011610 | 0.089897 | -0.018776 | -0.017423 | -0.192431 | -0.043378 | -0.001440 | 0.002444 | -0.010906 | 0.000986 | 0.019277 | 0.011232 | 0.006902 | 0.021038 | 0.018379 | 0.004748 | 0.006424 | 0.019631 | 0.011183 | 0.018652 |
| PC39 | 0.003929 | -0.001263 | -0.001495 | 0.003147 | -0.009314 | -0.000186 | -0.000186 | 0.001949 | 0.001287 | 0.000881 | 0.002248 | 0.004350 | 0.004146 | -0.001602 | -0.001939 | -0.013577 | -0.000190 | -0.002931 | 0.004321 | 0.003766 | 0.003225 | -5.114429e-02 | 0.000481 | 0.006800 | -0.001398 | -0.029064 | 0.004489 | 0.001942 | -0.053676 | 0.002219 | -0.010255 | -0.040398 | 0.001106 | 0.002346 | -0.003900 | -0.108216 | 0.248168 | 0.003270 | 0.028165 | 0.008149 | -0.001491 | -0.000448 | -0.646125 | -0.013556 | 0.025111 | -0.004769 | -0.006019 | -0.015752 | 0.005599 | 0.001816 | 0.001335 | -0.034707 | -0.014161 | 0.013511 | 0.007461 | 0.001731 | -0.002345 | -0.033252 | -0.003487 | 0.703591 | 0.019383 | -0.003468 | -0.001107 | 0.005950 | 0.003490 | 0.007850 | 0.008195 | 0.002718 | -0.009802 | 0.000108 | 0.004664 | 0.005407 | 0.006922 | 0.004660 | 0.003226 |
| PC40 | -0.004994 | 0.003918 | 0.000268 | -0.002597 | -0.000215 | 0.001214 | 0.001214 | -0.004198 | 0.001165 | -0.002257 | -0.002809 | 0.001513 | 0.018650 | 0.001486 | 0.001383 | -0.080395 | 0.005206 | 0.014184 | 0.005306 | 0.029499 | -0.007492 | -4.068405e-02 | -0.009341 | -0.004744 | 0.001804 | -0.105325 | -0.006706 | -0.001227 | -0.033954 | 0.051899 | -0.020806 | 0.006793 | 0.015302 | -0.002340 | 0.025066 | 0.278361 | 0.787072 | -0.010183 | 0.012666 | -0.019379 | -0.002466 | 0.000875 | -0.215148 | -0.012990 | -0.025000 | -0.098281 | 0.069283 | 0.008628 | 0.000190 | 0.008933 | 0.001160 | -0.063483 | -0.000115 | -0.016540 | -0.003520 | -0.033939 | 0.027732 | -0.048840 | -0.008061 | -0.447123 | -0.017019 | -0.018645 | 0.007067 | 0.007317 | 0.010008 | -0.007820 | -0.016621 | -0.019176 | -0.026037 | -0.020823 | -0.023158 | -0.021244 | -0.023359 | -0.034444 | -0.022844 |
| PC41 | -0.017241 | -0.001347 | 0.004120 | 0.001717 | -0.007824 | 0.005576 | 0.005576 | -0.000685 | 0.000921 | -0.000361 | 0.004980 | -0.002522 | -0.004415 | -0.025307 | -0.013295 | -0.283536 | 0.026767 | -0.037962 | -0.101377 | 0.061680 | -0.034627 | -1.068329e-01 | -0.024320 | 0.014364 | 0.018748 | -0.014240 | -0.022236 | -0.008504 | -0.136409 | 0.046236 | -0.053911 | 0.068989 | -0.042129 | -0.049348 | 0.045138 | -0.313215 | 0.423621 | 0.002948 | 0.036560 | 0.048352 | -0.029794 | -0.020699 | 0.621273 | 0.057135 | -0.128260 | 0.100526 | -0.028456 | -0.048538 | -0.015483 | -0.036812 | -0.004351 | -0.068750 | -0.025457 | -0.031507 | -0.075487 | -0.025384 | 0.045801 | -0.100765 | -0.019523 | 0.349523 | -0.015879 | -0.022313 | 0.005714 | 0.003927 | 0.012291 | -0.018286 | -0.003596 | 0.002768 | 0.002496 | 0.000099 | -0.013280 | -0.013926 | -0.015535 | -0.015121 | -0.008289 |
| PC42 | -0.002525 | -0.003185 | -0.004558 | 0.002157 | -0.000417 | -0.000459 | -0.000459 | 0.001497 | 0.001173 | 0.003273 | 0.008834 | 0.008916 | -0.002457 | -0.023022 | -0.014035 | -0.139646 | 0.011311 | -0.029339 | -0.069873 | 0.010757 | -0.003641 | -8.393217e-02 | 0.000387 | 0.015951 | 0.010119 | -0.204026 | -0.014689 | 0.002483 | -0.227090 | -0.022966 | 0.007209 | 0.003801 | -0.032066 | -0.034339 | -0.010733 | 0.830517 | -0.125427 | 0.025533 | 0.023399 | 0.041686 | -0.024347 | -0.005162 | 0.139428 | 0.043479 | -0.024710 | 0.122022 | -0.181137 | -0.000686 | -0.025111 | -0.052353 | -0.002678 | -0.015104 | -0.024690 | -0.078260 | -0.039510 | -0.007386 | 0.040523 | -0.014499 | -0.010390 | 0.266023 | -0.042114 | -0.009247 | 0.006886 | -0.002903 | -0.003160 | 0.014377 | 0.010128 | 0.015952 | 0.018697 | 0.015915 | 0.030333 | 0.018765 | 0.023986 | 0.028457 | 0.026325 |
| PC43 | -0.003584 | -0.002184 | 0.003622 | 0.001707 | 0.006020 | -0.001022 | -0.001022 | -0.001808 | 0.001661 | 0.000433 | -0.004022 | -0.006026 | 0.003622 | -0.014639 | -0.007693 | 0.841166 | 0.010169 | -0.000684 | -0.008429 | 0.020187 | -0.016061 | 1.883028e-02 | -0.007927 | 0.011917 | 0.000079 | -0.250484 | 0.002943 | -0.004935 | -0.410379 | 0.019464 | -0.000325 | 0.036026 | -0.017872 | -0.030619 | 0.021415 | -0.067309 | 0.091211 | 0.000259 | 0.001043 | 0.027995 | -0.003820 | -0.009818 | 0.102125 | 0.011551 | -0.133302 | 0.018990 | 0.007062 | 0.013431 | -0.023506 | -0.042670 | 0.002718 | -0.058795 | -0.010071 | -0.038869 | -0.038603 | -0.026960 | 0.008947 | -0.026791 | -0.020471 | 0.032670 | -0.032097 | 0.004362 | 0.001511 | -0.001858 | 0.009038 | -0.006379 | 0.001544 | -0.004197 | -0.002291 | 0.000800 | -0.004391 | -0.004666 | -0.011812 | -0.001349 | 0.006172 |
| PC44 | 0.004733 | 0.002138 | 0.002789 | 0.003656 | -0.012127 | 0.003962 | 0.003962 | 0.000173 | -0.002474 | 0.000967 | -0.004414 | -0.004121 | 0.023443 | 0.003196 | 0.006518 | -0.144234 | 0.002921 | -0.000056 | -0.038880 | 0.045610 | -0.011374 | 9.314519e-01 | -0.001687 | -0.014626 | 0.003437 | -0.174350 | 0.015400 | -0.002304 | -0.095228 | 0.040609 | -0.070696 | -0.023385 | 0.011078 | 0.019565 | 0.016809 | -0.025995 | 0.002926 | -0.009643 | 0.015246 | 0.001362 | 0.004330 | 0.009988 | -0.022315 | 0.001835 | -0.123291 | -0.064773 | -0.101133 | -0.001885 | -0.014924 | -0.027831 | -0.005109 | 0.041400 | 0.000004 | -0.030159 | -0.040893 | -0.017974 | 0.016722 | -0.137902 | -0.021302 | 0.020465 | 0.031667 | -0.020221 | 0.003313 | 0.004215 | 0.000765 | -0.003806 | -0.004798 | 0.003395 | -0.002801 | -0.017884 | 0.004526 | -0.002955 | 0.007082 | 0.009424 | -0.004022 |
| PC45 | -0.007037 | -0.000588 | -0.003001 | -0.000900 | 0.023844 | -0.008984 | -0.008984 | 0.000800 | -0.000875 | -0.004842 | -0.009784 | -0.011088 | -0.003861 | 0.025667 | 0.006733 | -0.052681 | -0.017058 | 0.019719 | 0.067263 | -0.007951 | 0.021809 | 6.575988e-02 | -0.003917 | 0.002922 | 0.006188 | 0.775773 | -0.004124 | 0.004542 | -0.552747 | -0.014590 | 0.077249 | -0.002461 | -0.015595 | 0.004618 | -0.005434 | 0.051207 | 0.013205 | -0.009014 | -0.010824 | -0.041728 | -0.000340 | 0.009451 | -0.067411 | -0.005054 | -0.184245 | -0.019851 | -0.083473 | 0.023170 | -0.052949 | 0.006610 | 0.010953 | 0.010027 | 0.000481 | -0.048900 | 0.064082 | 0.010404 | 0.002987 | 0.045318 | -0.007829 | -0.051864 | -0.066193 | 0.000243 | 0.001542 | -0.002336 | -0.001143 | -0.009732 | -0.008592 | -0.010496 | -0.023025 | -0.018323 | 0.003791 | 0.001840 | 0.008527 | 0.005550 | -0.009302 |
plt.figure(figsize=(8, 6))
plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.7, align='center')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.title('Explained Variance by Each Principal Component')
plt.show()
Plot above displays the explained variance by each PC, from 1 to 45, the Y axis shows the ammount of variance explained by each principal component, which in the plot proof that PC1 and PC2 are the largest amount of variance from the dataset